diff --git a/.gitignore b/.gitignore index e88ccfc008c..9a281d67675 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ make_config.mk *.vcxproj.filters *.sln *.cmake +.watchmanconfig CMakeCache.txt CMakeFiles/ build/ @@ -32,6 +33,8 @@ ldb manifest_dump sst_dump blob_dump +block_cache_trace_analyzer +tools/block_cache_analyzer/*.pyc column_aware_encoding_exp util/build_version.cc build_tools/VALGRIND_LOGS/ @@ -47,6 +50,8 @@ rocksdb_undump db_test2 trace_analyzer trace_analyzer_test +block_cache_trace_analyzer +.DS_Store java/out java/target @@ -74,3 +79,4 @@ tp2/ fbcode/ fbcode buckifier/*.pyc +buckifier/__pycache__ diff --git a/.travis.yml b/.travis.yml index e759a642a0c..7af91483bca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,4 @@ -sudo: false -dist: trusty +dist: xenial language: cpp os: - linux @@ -9,7 +8,7 @@ compiler: - gcc osx_image: xcode8.3 jdk: - - oraclejdk7 + - openjdk7 cache: - ccache - apt @@ -71,7 +70,10 @@ install: CC=gcc-8 && CXX=g++-8; fi - if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then - mkdir cmake-dist && curl -sfSL https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + mkdir cmake-dist && curl -sfSL https://github.com/Kitware/CMake/releases/download/v3.14.5/cmake-3.14.5-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + fi + - if [[ "${JOB_NAME}" == java_test ]]; then + java -version && echo "JAVA_HOME=${JAVA_HOME}"; fi before_script: @@ -87,13 +89,13 @@ script: OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some ;; 1) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=full_filter_block_test make -j4 check_some + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=db_iter_test make -j4 check_some ;; 2) - OPT=-DTRAVIS V=1 make -j4 tools && OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some + OPT=-DTRAVIS V=1 make -j4 tools && OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_iter_test ROCKSDBTESTS_END=options_file_test make -j4 check_some ;; 3) - OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_batch_with_index_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=options_file_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some ;; 4) OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_prepared_transaction_test make -j4 check_some @@ -101,7 +103,7 @@ script: esac - case $JOB_NAME in java_test) - OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest + OPT=-DTRAVIS V=1 make rocksdbjava jtest ;; lite_build) OPT='-DTRAVIS -DROCKSDB_LITE' V=1 make -j4 static_lib tools @@ -110,6 +112,7 @@ script: OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 ;; cmake-mingw) + sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix; mkdir build && cd build && cmake -DJNI=1 .. 
-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni ;; cmake*) @@ -119,5 +122,3 @@ script: notifications: email: - leveldb@fb.com - webhooks: - - https://buildtimetrend.herokuapp.com/travis diff --git a/.watchmanconfig b/.watchmanconfig new file mode 100644 index 00000000000..e5b450d7bbb --- /dev/null +++ b/.watchmanconfig @@ -0,0 +1,6 @@ +{ + "content_hash_warming": true, + "content_hash_max_items": 333333, + "hint_num_files_per_dir": 8, + "fsevents_latency": 0.05 +} diff --git a/CMakeLists.txt b/CMakeLists.txt index fb8067d2245..4da0b3628ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,17 +32,24 @@ # 3. cmake .. # 4. make -j -cmake_minimum_required(VERSION 2.8.12) -project(rocksdb) -enable_language(CXX) -enable_language(C) -enable_language(ASM) +cmake_minimum_required(VERSION 3.5.1) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +include(ReadVersion) +get_rocksdb_version(rocksdb_VERSION) +project(rocksdb + VERSION ${rocksdb_VERSION} + LANGUAGES CXX C ASM) if(POLICY CMP0042) cmake_policy(SET CMP0042 NEW) endif() -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) option(WITH_JEMALLOC "build with JeMalloc" OFF) option(WITH_SNAPPY "build with SNAPPY" OFF) @@ -53,6 +60,13 @@ option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() +# third-party/folly is only validated to work on Linux and Windows for now. +# So only turn it on there by default. +if(CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Windows") + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" ON) +else() + option(WITH_FOLLY_DISTRIBUTED_MUTEX "build with folly::DistributedMutex" OFF) +endif() if(MSVC) # Defaults currently different for GFLAGS. 
# We will address find_package work a little later @@ -68,8 +82,7 @@ else() if(WITH_JEMALLOC) find_package(JeMalloc REQUIRED) add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) - include_directories(${JEMALLOC_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${JEMALLOC_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS JeMalloc::JeMalloc) endif() endif() @@ -87,47 +100,43 @@ else() if(WITH_SNAPPY) find_package(snappy REQUIRED) add_definitions(-DSNAPPY) - include_directories(${SNAPPY_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${SNAPPY_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS snappy::snappy) endif() if(WITH_ZLIB) find_package(ZLIB REQUIRED) add_definitions(-DZLIB) - if(ZLIB_INCLUDE_DIRS) - # CMake 3 - include_directories(${ZLIB_INCLUDE_DIRS}) - else() - # CMake 2 - include_directories(${ZLIB_INCLUDE_DIR}) - endif() - list(APPEND THIRDPARTY_LIBS ${ZLIB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS ZLIB::ZLIB) endif() option(WITH_BZ2 "build with bzip2" OFF) if(WITH_BZ2) - find_package(bzip2 REQUIRED) + find_package(BZip2 REQUIRED) add_definitions(-DBZIP2) - include_directories(${BZIP2_INCLUDE_DIR}) + if(BZIP2_INCLUDE_DIRS) + include_directories(${BZIP2_INCLUDE_DIRS}) + else() + include_directories(${BZIP2_INCLUDE_DIR}) + endif() list(APPEND THIRDPARTY_LIBS ${BZIP2_LIBRARIES}) endif() if(WITH_LZ4) find_package(lz4 REQUIRED) add_definitions(-DLZ4) - include_directories(${LZ4_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${LZ4_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS lz4::lz4) endif() if(WITH_ZSTD) find_package(zstd REQUIRED) add_definitions(-DZSTD) include_directories(${ZSTD_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${ZSTD_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS zstd::zstd) endif() endif() -string(TIMESTAMP GIT_DATE_TIME "%Y/%m/%d %H:%M:%S" UTC) +string(TIMESTAMP TS "%Y/%m/%d %H:%M:%S" UTC) +set(GIT_DATE_TIME "${TS}" CACHE STRING "the time we first built rocksdb") find_package(Git) @@ -144,17 +153,6 @@ endif() string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") -# Read rocksdb version from version.h header file. 
-file(READ include/rocksdb/version.h version_header_file) -string(REGEX MATCH "#define ROCKSDB_MAJOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MAJOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_MINOR ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_MINOR ${CMAKE_MATCH_1}) -string(REGEX MATCH "#define ROCKSDB_PATCH ([0-9]+)" _ ${version_header_file}) -set(ROCKSDB_VERSION_PATCH ${CMAKE_MATCH_1}) -set(ROCKSDB_VERSION ${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}) - - option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) if(WITH_MD_LIBRARY) @@ -177,6 +175,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") + add_definitions(-D_POSIX_C_SOURCE=1) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -201,6 +200,15 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") endif(HAS_ALTIVEC) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC) + if(HAS_ARMV8_CRC) + message(STATUS " HAS_ARMV8_CRC yes") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") + endif(HAS_ARMV8_CRC) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") + option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) if(PORTABLE) @@ -213,7 +221,7 @@ else() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") else() - if(NOT HAVE_POWER8) + if(NOT HAVE_POWER8 AND NOT HAS_ARMV8_CRC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() endif() @@ -300,15 +308,14 @@ if(WITH_NUMA) find_package(NUMA REQUIRED) add_definitions(-DNUMA) include_directories(${NUMA_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${NUMA_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS NUMA::NUMA) endif() option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) if(WITH_TBB) find_package(TBB REQUIRED) add_definitions(-DTBB) - include_directories(${TBB_INCLUDE_DIR}) - list(APPEND THIRDPARTY_LIBS ${TBB_LIBRARIES}) + list(APPEND THIRDPARTY_LIBS TBB::TBB) endif() # Stall notifications eat some performance from inserts @@ -317,6 +324,10 @@ if(DISABLE_STALL_NOTIF) add_definitions(-DROCKSDB_DISABLE_STALL_NOTIFICATION) endif() +option(WITH_DYNAMIC_EXTENSION "build with dynamic extension support" OFF) +if(NOT WITH_DYNAMIC_EXTENSION) + add_definitions(-DROCKSDB_NO_DYNAMIC_EXTENSION) +endif() if(DEFINED USE_RTTI) if(USE_RTTI) @@ -458,38 +469,41 @@ endif() include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) -include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) +include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.8.1/fused-src) +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + include_directories(${PROJECT_SOURCE_DIR}/third-party/folly) +endif() find_package(Threads REQUIRED) -add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest) - # Main library source code set(SOURCES cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc + db/arena_wrapped_db_iter.cc db/builder.cc db/c.cc db/column_family.cc db/compacted_db_impl.cc - 
db/compaction.cc - db/compaction_iterator.cc - db/compaction_job.cc - db/compaction_picker.cc - db/compaction_picker_fifo.cc - db/compaction_picker_universal.cc + db/compaction/compaction.cc + db/compaction/compaction_iterator.cc + db/compaction/compaction_picker.cc + db/compaction/compaction_job.cc + db/compaction/compaction_picker_fifo.cc + db/compaction/compaction_picker_level.cc + db/compaction/compaction_picker_universal.cc db/convenience.cc db/db_filesnapshot.cc - db/db_impl.cc - db/db_impl_write.cc - db/db_impl_compaction_flush.cc - db/db_impl_files.cc - db/db_impl_open.cc - db/db_impl_debug.cc - db/db_impl_experimental.cc - db/db_impl_readonly.cc - db/db_impl_secondary.cc + db/db_impl/db_impl.cc + db/db_impl/db_impl_write.cc + db/db_impl/db_impl_compaction_flush.cc + db/db_impl/db_impl_files.cc + db/db_impl/db_impl_open.cc + db/db_impl/db_impl_debug.cc + db/db_impl/db_impl_experimental.cc + db/db_impl/db_impl_readonly.cc + db/db_impl/db_impl_secondary.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc @@ -501,8 +515,8 @@ set(SOURCES db/flush_job.cc db/flush_scheduler.cc db/forward_iterator.cc + db/import_column_family_job.cc db/internal_stats.cc - db/in_memory_stats_history.cc db/logs_with_prep_tracker.cc db/log_reader.cc db/log_writer.cc @@ -518,6 +532,7 @@ set(SOURCES db/table_cache.cc db/table_properties_collector.cc db/transaction_log_impl.cc + db/trim_history_scheduler.cc db/version_builder.cc db/version_edit.cc db/version_set.cc @@ -531,6 +546,22 @@ set(SOURCES env/env_encryption.cc env/env_hdfs.cc env/mock_env.cc + file/delete_scheduler.cc + file/file_prefetch_buffer.cc + file/file_util.cc + file/filename.cc + file/random_access_file_reader.cc + file/read_write_util.cc + file/readahead_raf.cc + file/sequence_file_reader.cc + file/sst_file_manager_impl.cc + file/writable_file_writer.cc + logging/auto_roll_logger.cc + logging/event_logger.cc + logging/log_buffer.cc + memory/arena.cc + memory/concurrent_arena.cc + memory/jemalloc_nodump_allocator.cc memtable/alloc_tracker.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc @@ -539,10 +570,12 @@ set(SOURCES memtable/write_buffer_manager.cc monitoring/histogram.cc monitoring/histogram_windowing.cc + monitoring/in_memory_stats_history.cc monitoring/instrumented_mutex.cc monitoring/iostats_context.cc monitoring/perf_context.cc monitoring/perf_level.cc + monitoring/persistent_stats_history.cc monitoring/statistics.cc monitoring/thread_status_impl.cc monitoring/thread_status_updater.cc @@ -555,80 +588,73 @@ set(SOURCES options/options_parser.cc options/options_sanity_check.cc port/stack_trace.cc - table/adaptive_table_factory.cc - table/block.cc - table/block_based_filter_block.cc - table/block_based_table_builder.cc - table/block_based_table_factory.cc - table/block_based_table_reader.cc - table/block_builder.cc + table/adaptive/adaptive_table_factory.cc + table/block_based/block.cc + table/block_based/block_based_filter_block.cc + table/block_based/block_based_table_builder.cc + table/block_based/block_based_table_factory.cc + table/block_based/block_based_table_reader.cc + table/block_based/block_builder.cc + table/block_based/block_prefix_index.cc + table/block_based/data_block_hash_index.cc + table/block_based/data_block_footer.cc + table/block_based/filter_block_reader_common.cc + table/block_based/filter_policy.cc + table/block_based/flush_block_policy.cc + table/block_based/full_filter_block.cc + table/block_based/index_builder.cc + table/block_based/parsed_full_filter_block.cc + 
table/block_based/partitioned_filter_block.cc + table/block_based/uncompression_dict_reader.cc table/block_fetcher.cc - table/block_prefix_index.cc - table/bloom_block.cc - table/cuckoo_table_builder.cc - table/cuckoo_table_factory.cc - table/cuckoo_table_reader.cc - table/data_block_hash_index.cc - table/data_block_footer.cc - table/flush_block_policy.cc + table/cuckoo/cuckoo_table_builder.cc + table/cuckoo/cuckoo_table_factory.cc + table/cuckoo/cuckoo_table_reader.cc table/format.cc - table/full_filter_block.cc table/get_context.cc - table/index_builder.cc table/iterator.cc table/merging_iterator.cc table/meta_blocks.cc - table/partitioned_filter_block.cc table/persistent_cache_helper.cc - table/plain_table_builder.cc - table/plain_table_factory.cc - table/plain_table_index.cc - table/plain_table_key_coding.cc - table/plain_table_reader.cc + table/plain/plain_table_bloom.cc + table/plain/plain_table_builder.cc + table/plain/plain_table_factory.cc + table/plain/plain_table_index.cc + table/plain/plain_table_key_coding.cc + table/plain/plain_table_reader.cc table/sst_file_reader.cc table/sst_file_writer.cc table/table_properties.cc table/two_level_iterator.cc + test_util/sync_point.cc + test_util/sync_point_impl.cc + test_util/testutil.cc + test_util/transaction_test_util.cc + tools/block_cache_analyzer/block_cache_trace_analyzer.cc tools/db_bench_tool.cc tools/dump/db_dump_tool.cc tools/ldb_cmd.cc tools/ldb_tool.cc tools/sst_dump_tool.cc tools/trace_analyzer_tool.cc - util/arena.cc - util/auto_roll_logger.cc - util/bloom.cc + trace_replay/trace_replay.cc + trace_replay/block_cache_tracer.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc util/compression_context_cache.cc - util/concurrent_arena.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc - util/delete_scheduler.cc util/dynamic_bloom.cc - util/event_logger.cc - util/file_reader_writer.cc - util/file_util.cc - util/filename.cc - util/filter_policy.cc util/hash.cc - util/jemalloc_nodump_allocator.cc - util/log_buffer.cc util/murmurhash.cc util/random.cc util/rate_limiter.cc util/slice.cc - util/sst_file_manager_impl.cc util/status.cc util/string_util.cc - util/sync_point.cc - util/sync_point_impl.cc - util/testutil.cc util/thread_local.cc util/threadpool_imp.cc - util/trace_replay.cc - util/transaction_test_util.cc util/xxhash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_compaction_filter.cc @@ -653,9 +679,11 @@ set(SOURCES utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc + utilities/merge_operators/sortlist.cc utilities/merge_operators/string_append/stringappend.cc utilities/merge_operators/string_append/stringappend2.cc utilities/merge_operators/uint64add.cc + utilities/object_registry.cc utilities/option_change_migration/option_change_migration.cc utilities/options/options_util.cc utilities/persistent_cache/block_cache_tier.cc @@ -663,6 +691,7 @@ set(SOURCES utilities/persistent_cache/block_cache_tier_metadata.cc utilities/persistent_cache/persistent_cache_tier.cc utilities/persistent_cache/volatile_tier_impl.cc + utilities/simulator_cache/cache_simulator.cc utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc @@ -696,6 +725,11 @@ if(HAVE_POWER8) util/crc32c_ppc_asm.S) endif(HAVE_POWER8) +if(HAS_ARMV8_CRC) + list(APPEND SOURCES + util/crc32c_arm64.cc) +endif(HAS_ARMV8_CRC) + if(WIN32) list(APPEND SOURCES port/win/io_win.cc @@ -722,6 
+756,15 @@ else() env/io_posix.cc) endif() +if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND SOURCES + third-party/folly/folly/detail/Futex.cpp + third-party/folly/folly/synchronization/AtomicNotification.cpp + third-party/folly/folly/synchronization/DistributedMutex.cpp + third-party/folly/folly/synchronization/ParkingLot.cpp + third-party/folly/folly/synchronization/WaitOptions.cpp) +endif() + set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX}) set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX}) set(ROCKSDB_IMPORT_LIB ${ROCKSDB_SHARED_LIB}) @@ -734,7 +777,7 @@ if(WITH_LIBRADOS) endif() if(WIN32) - set(SYSTEM_LIBS ${SYSTEM_LIBS} Shlwapi.lib Rpcrt4.lib) + set(SYSTEM_LIBS ${SYSTEM_LIBS} shlwapi.lib rpcrt4.lib) set(LIBS ${ROCKSDB_STATIC_LIB} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) else() set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) @@ -745,8 +788,8 @@ else() ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES LINKER_LANGUAGE CXX - VERSION ${ROCKSDB_VERSION} - SOVERSION ${ROCKSDB_VERSION_MAJOR} + VERSION ${rocksdb_VERSION} + SOVERSION ${rocksdb_VERSION_MAJOR} CXX_STANDARD 11 OUTPUT_NAME "rocksdb") endif() @@ -801,7 +844,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) write_basic_package_version_file( RocksDBConfigVersion.cmake - VERSION ${ROCKSDB_VERSION} + VERSION ${rocksdb_VERSION} COMPATIBILITY SameMajorVersion ) @@ -843,15 +886,16 @@ endif() option(WITH_TESTS "build with tests" ON) if(WITH_TESTS) + add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest) set(TESTS cache/cache_test.cc cache/lru_cache_test.cc db/column_family_test.cc db/compact_files_test.cc - db/compaction_iterator_test.cc - db/compaction_job_stats_test.cc - db/compaction_job_test.cc - db/compaction_picker_test.cc + db/compaction/compaction_job_stats_test.cc + db/compaction/compaction_job_test.cc + db/compaction/compaction_iterator_test.cc + db/compaction/compaction_picker_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc @@ -871,10 +915,11 @@ if(WITH_TESTS) db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc + db/db_merge_operand_test.cc db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_secondary_test.cc + db/db_impl/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc @@ -918,37 +963,41 @@ if(WITH_TESTS) env/env_basic_test.cc env/env_test.cc env/mock_env_test.cc + file/delete_scheduler_test.cc + logging/auto_roll_logger_test.cc + logging/env_logger_test.cc + logging/event_logger_test.cc + memory/arena_test.cc memtable/inlineskiplist_test.cc memtable/skiplist_test.cc memtable/write_buffer_manager_test.cc monitoring/histogram_test.cc monitoring/iostats_context_test.cc monitoring/statistics_test.cc + monitoring/stats_history_test.cc options/options_settable_test.cc options/options_test.cc - table/block_based_filter_block_test.cc - table/block_test.cc + table/block_based/block_based_filter_block_test.cc + table/block_based/block_test.cc + table/block_based/data_block_hash_index_test.cc + table/block_based/full_filter_block_test.cc + table/block_based/partitioned_filter_block_test.cc table/cleanable_test.cc - table/cuckoo_table_builder_test.cc - table/cuckoo_table_reader_test.cc - table/data_block_hash_index_test.cc - table/full_filter_block_test.cc + table/cuckoo/cuckoo_table_builder_test.cc + table/cuckoo/cuckoo_table_reader_test.cc table/merger_test.cc table/sst_file_reader_test.cc table/table_test.cc + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc 
tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc tools/trace_analyzer_test.cc - util/arena_test.cc - util/auto_roll_logger_test.cc util/autovector_test.cc util/bloom_test.cc util/coding_test.cc util/crc32c_test.cc - util/delete_scheduler_test.cc util/dynamic_bloom_test.cc - util/event_logger_test.cc util/file_reader_writer_test.cc util/filelock_test.cc util/hash_test.cc @@ -973,6 +1022,7 @@ if(WITH_TESTS) utilities/options/options_util_test.cc utilities/persistent_cache/hash_table_test.cc utilities/persistent_cache/persistent_cache_test.cc + utilities/simulator_cache/cache_simulator_test.cc utilities/simulator_cache/sim_cache_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc @@ -986,14 +1036,19 @@ if(WITH_TESTS) list(APPEND TESTS utilities/env_librados_test.cc) endif() + if(WITH_FOLLY_DISTRIBUTED_MUTEX) + list(APPEND TESTS third-party/folly/folly/synchronization/test/DistributedMutexTest.cpp) + endif() + set(BENCHMARKS cache/cache_bench.cc memtable/memtablerep_bench.cc db/range_del_aggregator_bench.cc tools/db_bench.cc table/table_reader_bench.cc + util/filter_bench.cc utilities/persistent_cache/hash_table_bench.cc) - add_library(testharness OBJECT util/testharness.cc) + add_library(testharness OBJECT test_util/testharness.cc) foreach(sourcefile ${BENCHMARKS}) get_filename_component(exename ${sourcefile} NAME_WE) add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} @@ -1007,7 +1062,7 @@ if(WITH_TESTS) db/db_test_util.cc monitoring/thread_status_updater_debug.cc table/mock_table.cc - util/fault_injection_test_env.cc + test_util/fault_injection_test_env.cc utilities/cassandra/test_utils.cc ) # test utilities are only build in debug @@ -1015,6 +1070,7 @@ if(WITH_TESTS) add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX}) add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE}) + target_link_libraries(${TESTUTILLIB} ${LIBS}) if(MSVC) set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb") endif() diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 0a45f9bd5f0..d1abc700d28 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,3 +1,77 @@ # Code of Conduct -Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.facebook.com/codeofconduct) so that you can understand what actions will and will not be tolerated. +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq + diff --git a/HISTORY.md b/HISTORY.md index 66dd73965ec..3f1e7c9ae81 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,13 +1,184 @@ # Rocksdb Change Log -## Unreleased +## 6.6.0 (11/25/2019) +### Bug Fixes +* Fix data corruption caused by output of intra-L0 compaction on an ingested file not being placed in the correct order in L0. +* Fix a data race between Version::GetColumnFamilyMetaData() and Compaction::MarkFilesBeingCompacted() for access to being_compacted (#6056). The current fix acquires the db mutex during Version::GetColumnFamilyMetaData(), which may cause regression.
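For readers unfamiliar with the API involved in the data-race fix above, here is a minimal sketch of how `GetColumnFamilyMetaData()` is typically consumed; the already-open `db` handle and the printed fields are illustrative assumptions, not part of the change itself.

```cpp
#include <iostream>

#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

// Assumes `db` is an already-opened rocksdb::DB*. This is the call whose
// synchronization with compaction bookkeeping (being_compacted) is fixed above.
void DumpColumnFamilyFiles(rocksdb::DB* db) {
  rocksdb::ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(&cf_meta);  // default column family
  for (const auto& level : cf_meta.levels) {
    for (const auto& file : level.files) {
      std::cout << "L" << level.level << " " << file.name
                << " being_compacted=" << file.being_compacted << "\n";
    }
  }
}
```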
+* Fix a bug in DBIter where the is_blob_ state isn't updated when iterating backward using seek. +* Fix a bug when format_version=3, partitioned filters, and prefix search are used in conjunction. The bug could result in ::Seek(prefix) returning NotFound for an existing prefix. +* Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. +* Fix a bug causing a crash during ingest external file when a background compaction causes a severe error (file not found). +* Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. +* Fix OnFlushCompleted fired before the flush result is persisted in MANIFEST when there's a concurrent flush job. The bug has existed since OnFlushCompleted was introduced in RocksDB 3.8. +* Fixed an sst_dump crash on some plain table SST files. +* Fixed a memory leak in some error cases of opening plain table SST files. +* Fix a bug when a crash happens while calling WriteLevel0TableForRecovery for multiple column families, leading to a column family's log number greater than the first corrupted log number when the DB is being opened in PointInTime recovery mode during the next recovery attempt (#5856). + +### New Features +* Universal compaction now supports options.periodic_compaction_seconds. A full compaction will be triggered if any file is over the threshold. +* `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` now expose the file number of SST files as well as the oldest blob file referenced by each SST. +* A batched MultiGet API (DB::MultiGet()) that supports retrieving keys from multiple column families. +* Full and partitioned filters in the block-based table use an improved Bloom filter implementation, enabled with format_version 5 (or above) because previous releases cannot read this filter. This replacement is faster and more accurate, especially for high bits per key or millions of keys in a single (full) filter. For example, the new Bloom filter has the same false positive rate at 9.55 bits per key as the old one at 10 bits per key, and a lower false positive rate at 16 bits per key than the old one at 100 bits per key. +* Added AVX2 instructions to USE_SSE builds to accelerate the new Bloom filter and XXH3-based hash function on compatible x86_64 platforms (Haswell and later, ~2014). +* Support options.ttl or options.periodic_compaction_seconds with options.max_open_files = -1. A file's oldest ancestor time and file creation time will be written to the manifest. If available, this information will be used instead of creation_time and file_creation_time in table properties. +* Setting options.ttl for universal compaction now has the same meaning as setting periodic_compaction_seconds. +* SstFileMetaData also returns file creation time and oldest ancestor time. +* The `sst_dump` command line tool `recompress` command now displays how many blocks were compressed and how many were not, in particular how many were not compressed because the compression ratio was not met (12.5% threshold for GoodCompressionRatio), as seen in the `number.block.not_compressed` counter stat since version 6.0.0. +* The block cache usage now takes into account the overhead of metadata for each entry. This results in more accurate management of memory.
A side-effect of this feature is that fewer items fit into a block cache of the same size, which would result in higher cache miss rates. This can be remedied by increasing the block cache size or passing kDontChargeCacheMetadata to its constructor to restore the old behavior. +* When using BlobDB, a mapping is maintained and persisted in the MANIFEST between each SST file and the oldest non-TTL blob file it references. +* `db_bench` now supports and by default issues non-TTL Puts to BlobDB. TTL Puts can be enabled by specifying a non-zero value for the `blob_db_max_ttl_range` command line parameter explicitly. +* `sst_dump` now supports printing BlobDB blob indexes in a human-readable format. This can be enabled by specifying the `decode_blob_index` flag on the command line. +* A number of new information elements are now exposed through the EventListener interface. For flushes, the file numbers of the new SST file and the oldest blob file referenced by the SST are propagated. For compactions, the level, file number, and the oldest blob file referenced are passed to the client for each compaction input and output file. + +### Public API Change +* RocksDB release 4.1 or older will not be able to open a DB generated by the new release. 4.2 was released on Feb 23, 2016. +* TTL Compactions in Level compaction style now initiate successive cascading compactions on a key range so that it reaches the bottom level quickly on TTL expiry. The `creation_time` table property for compaction output files is now set to the minimum of the creation times of all compaction inputs. +* With FIFO compaction style, options.periodic_compaction_seconds will have the same meaning as options.ttl. Whichever is stricter will be used. With the default options.periodic_compaction_seconds value and options.ttl's default of 0, RocksDB will give a default of 30 days. +* Added an API GetCreationTimeOfOldestFile(uint64_t* creation_time) to get the file_creation_time of the oldest SST file in the DB. +* FilterPolicy now exposes an additional API to make it possible to choose filter configurations based on context, such as table level and compaction style. See `LevelAndStyleCustomFilterPolicy` in db_bloom_filter_test.cc. While most existing custom implementations of FilterPolicy should continue to work as before, those wrapping the return of NewBloomFilterPolicy will require overriding the new function `GetBuilderWithContext()`, because calling `GetFilterBitsBuilder()` on the FilterPolicy returned by NewBloomFilterPolicy is no longer supported. +* An unlikely usage of FilterPolicy is no longer supported. Calling GetFilterBitsBuilder() on the FilterPolicy returned by NewBloomFilterPolicy will now cause an assertion violation in debug builds, because RocksDB has internally migrated to a more elaborate interface that is expected to evolve further. Custom implementations of FilterPolicy should work as before, except those wrapping the return of NewBloomFilterPolicy, which will require a new override of a protected function in FilterPolicy. +* NewBloomFilterPolicy now takes bits_per_key as a double instead of an int. This permits finer control over the memory vs. accuracy trade-off in the new Bloom filter implementation and should not change source code compatibility. +* The option BackupableDBOptions::max_valid_backups_to_open is now only used when opening BackupEngineReadOnly. When opening a read/write BackupEngine, anything but the default value logs a warning and is treated as the default.
This change ensures that backup deletion has proper accounting of shared files to ensure they are deleted when no longer referenced by a backup. +* Deprecate `snap_refresh_nanos` option. +* Added DisableManualCompaction/EnableManualCompaction to stop and resume manual compaction. +* Add TryCatchUpWithPrimary() to StackableDB in non-LITE mode. +* Add a new Env::LoadEnv() overloaded function to return a shared_ptr to Env. +* Flush sets the file name to "(nil)" for OnTableFileCreationCompleted() if the flush does not produce any L0 file. This can happen if the file is empty and thus deleted by RocksDB. + +### Default Option Changes +* Changed the default value of periodic_compaction_seconds to `UINT64_MAX - 1` which allows RocksDB to auto-tune periodic compaction scheduling. When using the default value, periodic compactions are now auto-enabled if a compaction filter is used. A value of `0` will turn off the feature completely. +* Changed the default value of ttl to `UINT64_MAX - 1` which allows RocksDB to auto-tune the ttl value. When using the default value, TTL will be auto-enabled to 30 days, when the feature is supported. To revert to the old behavior, you can explicitly set it to 0. + +### Performance Improvements +* For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement. +* Level iterator to invalidate the iterator more often in prefix seek when the level is filtered out by prefix bloom. + +## 6.5.2 (11/15/2019) +### Bug Fixes +* Fix an assertion failure in MultiGet() when BlockBasedTableOptions::no_block_cache is true and there is no compressed block cache. +* Fix a buffer overrun problem in BlockBasedTable::MultiGet() when compression is enabled and no compressed block cache is configured. +* If a call to BackupEngine::PurgeOldBackups or BackupEngine::DeleteBackup suffered a crash, power failure, or I/O error, files could be left over from old backups that could only be purged with a call to GarbageCollect. Any call to PurgeOldBackups, DeleteBackup, or GarbageCollect should now suffice to purge such files (see the sketch below). + +## 6.5.1 (10/16/2019) +### Bug Fixes +* Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. +* Fix a bug in BlockBasedTableIterator that might return incorrect results when reseek happens with a different iterator upper bound. +* Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. + +## 6.5.0 (9/13/2019) +### Bug Fixes +* Fixed a number of data races in BlobDB. +* Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0. +* Fix bloom filter lookups by the MultiGet batching API when BlockBasedTableOptions::whole_key_filtering is false, by checking that a key is in the prefix_extractor domain and extracting the prefix before looking up. +* Fix a bug in file ingestion caused by incorrect file number allocation when the number of column families involved in the ingestion exceeds 2.
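As referenced from the 6.5.2 backup bullet above, a minimal BackupEngine sketch showing the calls involved; the backup directory, the already-open `db` pointer, and the assert-based error handling are illustrative assumptions only.

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/utilities/backupable_db.h"

// Assumes `db` is an open rocksdb::DB*. After the fix above, an ordinary
// PurgeOldBackups()/DeleteBackup() also cleans up files left over from a
// previously interrupted deletion, without requiring GarbageCollect().
void BackupAndTrim(rocksdb::DB* db) {
  rocksdb::BackupEngine* backup_engine = nullptr;
  rocksdb::Status s = rocksdb::BackupEngine::Open(
      rocksdb::Env::Default(),
      rocksdb::BackupableDBOptions("/tmp/rocksdb_backup"),  // placeholder path
      &backup_engine);
  assert(s.ok());
  s = backup_engine->CreateNewBackup(db, /*flush_before_backup=*/true);
  assert(s.ok());
  s = backup_engine->PurgeOldBackups(/*num_backups_to_keep=*/3);
  assert(s.ok());
  delete backup_engine;
}
```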
+ +### New Features +* Introduced DBOptions::max_write_batch_group_size_bytes to configure a maximum limit on the number of bytes that are written in a single batch of WAL or memtable writes. It is followed when the leader write size is larger than 1/8 of this limit. +* VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readahead size. For checksum verification before external SST file ingestion, a new option IngestExternalFileOptions.verify_checksums_readahead_size is added for this readahead setting. +* When a user enables options.force_consistency_check in RocksDB, instead of crashing the process, we now pass the error back to the user without killing the process. +* Add an option `memtable_insert_hint_per_batch` to WriteOptions. If it is true, each WriteBatch will maintain its own insert hints for each memtable in concurrent writes. See include/rocksdb/options.h for more details. + +### Public API Change +* Added the max_write_buffer_size_to_maintain option to better control the memory usage of immutable memtables. +* Added a lightweight API GetCurrentWalFile() to get the last live WAL file name and size. Meant to be used as a helper for backup/restore tooling in a larger ecosystem such as MySQL with a MyRocks storage engine. +* The MemTable Bloom filter, when enabled, now always uses cache locality. Options::bloom_locality now only affects the PlainTable SST format. + +### Performance Improvements +* Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance. + +## 6.4.0 (7/30/2019) +### Default Option Change +* LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of the high_pri_pool_ratio argument in NewLRUCache(). When the block cache is not explicitly created, the small block cache created by BlockBasedTable will still have this option set to 0.0. +* Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. + +### Public API Change +* Filter and compression dictionary blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, filter and compression dictionary blocks (as well as filter partitions) no longer get evicted from the cache when a table is closed. +* Due to the above refactoring, block cache eviction statistics for filter and compression dictionary blocks are temporarily broken. We plan to reintroduce them in a later phase. +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. +* Errors related to the retrieval of the compression dictionary are now propagated to the user. +* db_bench adds a "benchmark" stats_history, which prints out the whole stats history. +* Overload GetAllKeyVersions() to support non-default column families. +* Added new APIs ExportColumnFamily() and CreateColumnFamilyWithImport() to support export and import of a Column Family. https://github.com/facebook/rocksdb/issues/3469 +* ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator. +* Replaces the old Registrar with ObjectRegistry to allow users to create custom objects from strings; also adds LoadEnv() to Env (see the sketch below).
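A rough sketch of the ObjectRegistry/LoadEnv() usage mentioned in the bullet above; the URI name is an assumption (it must match whatever a custom Env was registered under), and the fallback to the default Env is illustrative.

```cpp
#include <string>

#include "rocksdb/env.h"

// Resolve an Env from a textual identifier. The identifier is looked up via the
// ObjectRegistry, so some code must have registered an Env factory under that
// name; on failure this sketch simply falls back to the default Env.
rocksdb::Env* LoadEnvOrDefault(const std::string& env_uri) {
  rocksdb::Env* env = nullptr;
  rocksdb::Status s = rocksdb::Env::LoadEnv(env_uri, &env);
  return (s.ok() && env != nullptr) ? env : rocksdb::Env::Default();
}
```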
+* Added a new overload of GetApproximateSizes which takes a SizeApproximationOptions object and returns a Status. The older overloads are redirecting their calls to this new method and no longer assert if the include_flags doesn't have either of the INCLUDE_MEMTABLES or INCLUDE_FILES bits set. It's recommended to use the new method only, as it is more type safe and returns a meaningful status in case of errors. +* LDBCommandRunner::RunCommand() to return the status code as an integer, rather than call exit() using the code. + +### New Features +* Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. +* Compression dictionary blocks are now prefetched and pinned in the cache (based on the customer's settings) the same way as index and filter blocks. +* Added DBOptions::log_readahead_size which specifies the number of bytes to prefetch when reading the log. This is mostly useful for reading a remotely located log, as it can save the number of round-trips. If 0 (default), then the prefetching is disabled. +* Added a new option in SizeApproximationOptions used with DB::GetApproximateSizes. When approximating the total size of the files used to store a key range, allow approximation with an error margin of up to total_files_size * files_size_error_margin. This allows taking some shortcuts in file size approximation, resulting in better performance, while guaranteeing the resulting error is within a reasonable margin. +* Support loading custom objects in unit tests. In the affected unit tests, RocksDB will create custom Env objects based on the environment variable TEST_ENV_URI. Users need to make sure custom object types are properly registered. For example, a static library should expose a `RegisterCustomObjects` function. By linking the unit test binary with the static library, the unit test can execute this function. + +### Performance Improvements +* Reduce iterator key comparison for the upper/lower bound check. +* Improve performance of row_cache: make reads with newer snapshots than data in an SST file share the same cache key, except in some transaction cases. +* The compression dictionary is no longer copied to a new object upon retrieval. + +### Bug Fixes +* Fix ingested file and directory not being fsynced. +* Return TryAgain status in place of Corruption when the new tail is not visible to TransactionLogIterator. +* Fixed a regression where the fill_cache read option also affected index blocks. +* Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes/filters as well. + +## 6.3.2 (8/15/2019) +### Public API Change +* The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. + +### Bug Fixes +* Fixed a regression where the fill_cache read option also affected index blocks. +* Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes as well. + +## 6.3.1 (7/24/2019) +### Bug Fixes +* Fix an auto-rolling bug introduced in 6.3.0, which causes a segfault if log file creation fails. + +## 6.3.0 (6/18/2019) +### Public API Change +* Now DB::Close() will return an Aborted() error when there is an unreleased snapshot. Users can retry after all snapshots are released. +* Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached.
In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. +* Partitions of partitioned indexes no longer affect the read amplification statistics. +* Due to the above refactoring, block cache eviction statistics for indexes are temporarily broken. We plan to reintroduce them in a later phase. +* options.keep_log_file_num will be enforced strictly all the time. File names of all log files will be tracked, which may take a significant amount of memory if options.keep_log_file_num is large and either of options.max_log_file_size or options.log_file_time_to_roll is set. +* Add initial support for Get/Put with user timestamps. Users can specify timestamps via ReadOptions and WriteOptions when calling DB::Get and DB::Put. +* Accessing a partition of a partitioned filter or index through a pinned reference is no longer considered a cache hit. +* Add C bindings for the secondary instance, i.e. DBImplSecondary. +* Rate limited deletion of WALs is only enabled if DBOptions::wal_dir is not set, or explicitly set to the db_name passed to DB::Open and DBOptions::db_paths is empty, or same as db_paths[0].path + +### New Features +* Add an option `snap_refresh_nanos` (default 0) to periodically refresh the snapshot list in compaction jobs. Assign 0 to disable the feature. +* Add an option `unordered_write` which trades snapshot guarantees for higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with no compromise on guarantees. +* Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. +* Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and the hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true (see the sketch below). Otherwise, ingestion reports an error. +* Add command `list_file_range_deletes` in ldb, which prints out tombstones in SST files. + +### Performance Improvements +* Reduce binary search when an iterator reseeks into the same data block. +* DBIter::Next() can skip user key checking if the previous entry's seqnum is 0. +* Merging iterator to avoid child iterator reseek for some cases +* Log Writer will flush after finishing the whole record, rather than a fragment. +* Lower MultiGet batching API latency by reading data blocks from disk in parallel + +### General Improvements +* Added new status code kColumnFamilyDropped to distinguish between Column Family Dropped and DB Shutdown in progress. +* Improve ColumnFamilyOptions validation when creating a new column family. + +### Bug Fixes +* Fix a bug in WAL replay of the secondary instance by skipping write batches with older sequence numbers than the current last sequence number. +* Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. +* Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing the db sequence due to another flushed/compacted column family. +* Fix a bug caused by the secondary not skipping the beginning of a new MANIFEST.
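The sketch referenced from the `failed_move_fall_back_to_copy` bullet above; the SST path, the open `db` pointer, and the surrounding helper function are illustrative assumptions, not part of the diff.

```cpp
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Assumes `db` is an open rocksdb::DB* and `sst_path` points at a file written
// by rocksdb::SstFileWriter. With move_files=true the file is hard-linked into
// the DB directory; if hard linking fails (e.g. across filesystems), ingestion
// falls back to a copy because failed_move_fall_back_to_copy is true (the
// default since 6.3.0) rather than reporting an error.
rocksdb::Status IngestOne(rocksdb::DB* db, const std::string& sst_path) {
  rocksdb::IngestExternalFileOptions ifo;
  ifo.move_files = true;
  ifo.failed_move_fall_back_to_copy = true;
  return db->IngestExternalFile({sst_path}, ifo);
}
```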
+* On DB open, delete WAL trash files left behind in wal_dir + +## 6.2.0 (4/30/2019) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. * Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level. * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior. * When reading from option file/string/map, customized envs can be filled according to object registry. -* Add an option `snap_refresh_nanos` (default to 0.5s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature. * Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator. +* Add index type BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It significantly reduces read amplification in some setups, especially for iterator seeks. It's not fully implemented yet: IO errors are not handled right. ### Public API Change * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering. @@ -19,6 +190,7 @@ * Fix a race condition between WritePrepared::Get and ::Put with duplicate keys. * Fix crash when memtable prefix bloom is enabled and read/write a key out of domain of prefix extractor. * Close a WAL file before another thread deletes it. +* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. ## 6.1.1 (4/9/2019) ### New Features diff --git a/Makefile b/Makefile index eee0f9fba02..3b0ddffee51 100644 --- a/Makefile +++ b/Makefile @@ -82,17 +82,23 @@ ifeq ($(MAKECMDGOALS),rocksdbjavastatic) endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticrelease) - DEBUG_LEVEL=0 + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker) - DEBUG_LEVEL=0 + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) DEBUG_LEVEL=0 endif +$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL}) + # Lite build flag. LITE ?= 0 ifeq ($(LITE), 0) @@ -137,6 +143,12 @@ CFLAGS += -DHAVE_POWER8 HAVE_POWER8=1 endif +ifeq (,$(shell $(CXX) -fsyntax-only -march=armv8-a+crc+crypto -xc /dev/null 2>&1)) +CXXFLAGS += -march=armv8-a+crc+crypto +CFLAGS += -march=armv8-a+crc+crypto +ARMCRC_SOURCE=1 +endif + # if we're compiling for release, compile without debug code (-DNDEBUG) ifeq ($(DEBUG_LEVEL),0) OPT += -DNDEBUG @@ -244,8 +256,8 @@ endif ifdef COMPILE_WITH_TSAN DISABLE_JEMALLOC=1 EXEC_LDFLAGS += -fsanitize=thread - PLATFORM_CCFLAGS += -fsanitize=thread -fPIC - PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC + PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DFOLLY_SANITIZE_THREAD + PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DFOLLY_SANITIZE_THREAD # Turn off -pg when enabling TSAN testing, because that induces # a link failure. 
TODO: find the root cause PROFILING_FLAGS = @@ -292,9 +304,13 @@ ifndef DISABLE_JEMALLOC PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) endif +ifndef USE_FOLLY_DISTRIBUTED_MUTEX + USE_FOLLY_DISTRIBUTED_MUTEX=0 +endif + export GTEST_THROW_ON_FAILURE=1 export GTEST_HAS_EXCEPTIONS=1 -GTEST_DIR = ./third-party/gtest-1.7.0/fused-src +GTEST_DIR = ./third-party/gtest-1.8.1/fused-src # AIX: pre-defined system headers are surrounded by an extern "C" block ifeq ($(PLATFORM), OS_AIX) PLATFORM_CCFLAGS += -I$(GTEST_DIR) @@ -304,6 +320,23 @@ else PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR) endif +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLY_DIR = ./third-party/folly + # AIX: pre-defined system headers are surrounded by an extern "C" block + ifeq ($(PLATFORM), OS_AIX) + PLATFORM_CCFLAGS += -I$(FOLLY_DIR) + PLATFORM_CXXFLAGS += -I$(FOLLY_DIR) + else + PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR) + PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR) + endif +endif + +ifdef TEST_CACHE_LINE_SIZE + PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) + PLATFORM_CXXFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE) +endif + # This (the first rule) must depend on "all". default: all @@ -390,10 +423,13 @@ endif LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + FOLLYOBJECTS = $(FOLLY_SOURCES:.cpp=.o) +endif GTEST = $(GTEST_DIR)/gtest/gtest-all.o -TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) +TESTUTIL = ./test_util/testutil.o +TESTHARNESS = ./test_util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST) VALGRIND_ERROR = 2 VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) @@ -403,6 +439,8 @@ BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) ANALYZETOOLOBJECTS = $(ANALYZER_LIB_SOURCES:.cc=.o) +STRESSTOOLOBJECTS = $(STRESS_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) + EXPOBJECTS = $(LIBOBJECTS) $(TESTUTIL) TESTS = \ @@ -420,6 +458,7 @@ TESTS = \ inlineskiplist_test \ env_basic_test \ env_test \ + env_logger_test \ hash_test \ thread_local_test \ rate_limiter_test \ @@ -429,10 +468,10 @@ TESTS = \ db_block_cache_test \ db_test \ db_blob_index_test \ - db_bloom_filter_test \ db_iter_test \ db_iter_stress_test \ db_log_iter_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_dynamic_level_test \ @@ -441,6 +480,7 @@ TESTS = \ db_iterator_test \ db_memtable_test \ db_merge_operator_test \ + db_merge_operand_test \ db_options_test \ db_range_del_test \ db_secondary_test \ @@ -487,6 +527,7 @@ TESTS = \ plain_table_db_test \ comparator_db_test \ external_sst_file_test \ + import_column_family_test \ prefix_test \ skiplist_test \ write_buffer_manager_test \ @@ -497,6 +538,7 @@ TESTS = \ cassandra_serialize_test \ ttl_test \ backupable_db_test \ + cache_simulator_test \ sim_cache_test \ version_edit_test \ version_set_test \ @@ -536,6 +578,7 @@ TESTS = \ ldb_cmd_test \ persistent_cache_test \ statistics_test \ + stats_history_test \ lru_cache_test \ object_registry_test \ repair_test \ @@ -549,9 +592,16 @@ TESTS = \ range_del_aggregator_test \ sst_file_reader_test \ db_secondary_test \ + block_cache_tracer_test \ + block_cache_trace_analyzer_test \ + +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) + TESTS += folly_synchronization_distributed_mutex_test +endif PARALLEL_TEST = \ backupable_db_test \ + db_bloom_filter_test \ db_compaction_filter_test \ db_compaction_test \ db_merge_operator_test \ @@ -560,7 +610,9 @@ PARALLEL_TEST = \ 
db_universal_compaction_test \ db_wal_test \ external_sst_file_test \ + import_column_family_test \ fault_injection_test \ + file_reader_writer_test \ inlineskiplist_test \ manual_compaction_test \ persistent_cache_test \ @@ -593,12 +645,13 @@ TOOLS = \ rocksdb_undump \ blob_dump \ trace_analyzer \ + block_cache_trace_analyzer \ TEST_LIBS = \ librocksdb_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. -BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench persistent_cache_bench range_del_aggregator_bench +BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench filter_bench persistent_cache_bench range_del_aggregator_bench # if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) @@ -611,6 +664,7 @@ endif endif LIBRARY = ${LIBNAME}.a TOOLS_LIBRARY = ${LIBNAME}_tools.a +STRESS_LIBRARY = ${LIBNAME}_stress.a ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) @@ -700,6 +754,8 @@ static_lib: $(LIBRARY) shared_lib: $(SHARED) +stress_lib: $(STRESS_LIBRARY) + tools: $(TOOLS) tools_lib: $(TOOLS_LIBRARY) @@ -904,7 +960,7 @@ blackbox_crash_test: db_stress python -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) blackbox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --enable_atomic_flush blackbox $(CRASH_TEST_EXT_ARGS) + python -u tools/db_crashtest.py --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS) ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 @@ -917,7 +973,7 @@ whitebox_crash_test: db_stress $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) whitebox_crash_test_with_atomic_flush: db_stress - python -u tools/db_crashtest.py --enable_atomic_flush whitebox --random_kill_odd \ + python -u tools/db_crashtest.py --cf_consistency whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) asan_check: @@ -1047,14 +1103,23 @@ unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) $(TOOLLIBOBJECTS) unit rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc -clean: - rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED) +clean: clean-ext-libraries-all clean-rocks + +clean-not-downloaded: clean-ext-libraries-bin clean-rocks + +clean-rocks: + rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(LIBRARY) $(SHARED) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report $(FIND) . -name "*.[oda]" -exec rm -f {} \; $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; - rm -rf bzip2* snappy* zlib* lz4* zstd* cd java; $(MAKE) clean +clean-ext-libraries-all: + rm -rf bzip2* snappy* zlib* lz4* zstd* + +clean-ext-libraries-bin: + find . -maxdepth 1 -type d \( -name bzip2\* -or -name snappy\* -or -name zlib\* -or -name lz4\* -or -name zstd\* \) -prune -exec rm -rf {} \; + tags: ctags -R . cscope -b `$(FIND) . -name '*.cc'` `$(FIND) . -name '*.h'` `$(FIND) . 
-name '*.c'` @@ -1084,6 +1149,10 @@ $(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_S $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ +$(STRESS_LIBRARY): $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) $(STRESS_LIB_SOURCES:.cc=.o) + $(AM_V_AR)rm -f $@ + $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ + librocksdb_env_basic_test.a: env/env_basic_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -1094,6 +1163,14 @@ db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) $(AM_LINK) +block_cache_trace_analyzer: tools/block_cache_analyzer/block_cache_trace_analyzer_tool.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) + $(AM_LINK) + +ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) +folly_synchronization_distributed_mutex_test: $(LIBOBJECTS) $(TESTHARNESS) $(FOLLYOBJECTS) third-party/folly/folly/synchronization/test/DistributedMutexTest.o + $(AM_LINK) +endif + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) @@ -1103,7 +1180,10 @@ persistent_cache_bench: utilities/persistent_cache/persistent_cache_bench.o $(LI memtablerep_bench: memtable/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL) +filter_bench: util/filter_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(AM_LINK) + +db_stress: tools/db_stress.o $(STRESSTOOLOBJECTS) $(AM_LINK) write_stress: tools/write_stress.o $(LIBOBJECTS) $(TESTUTIL) @@ -1115,7 +1195,7 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) -arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) +arena_test: memory/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1172,7 +1252,7 @@ histogram_test: monitoring/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) +corruption_test: db/corruption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1229,6 +1309,9 @@ db_memtable_test: db/db_memtable_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHA db_merge_operator_test: db/db_merge_operator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_merge_operand_test: db/db_merge_operand_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_options_test: db/db_options_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1253,6 +1336,9 @@ external_sst_file_basic_test: db/external_sst_file_basic_test.o db/db_test_util. 
external_sst_file_test: db/external_sst_file_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +import_column_family_test: db/import_column_family_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1301,6 +1387,9 @@ backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TE checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +cache_simulator_test: utilities/simulator_cache/cache_simulator_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1327,13 +1416,13 @@ write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_i flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_iterator_test: db/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_iterator_test: db/compaction/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_test: db/compaction/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_job_stats_test: db/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_job_stats_test: db/compaction/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) compact_on_deletion_collector_test: utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1357,7 +1446,7 @@ fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) rate_limiter_test: util/rate_limiter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -delete_scheduler_test: util/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) +delete_scheduler_test: file/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1366,13 +1455,13 @@ filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_based_filter_block_test: table/block_based/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +full_filter_block_test: table/block_based/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -partitioned_filter_block_test: table/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) +partitioned_filter_block_test: table/block_based/partitioned_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1384,10 +1473,10 @@ cleanable_test: table/cleanable_test.o $(LIBOBJECTS) $(TESTHARNESS) table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) +block_test: table/block_based/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -data_block_hash_index_test: table/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) +data_block_hash_index_test: table/block_based/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) 
$(TESTHARNESS) @@ -1405,7 +1494,7 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) +compaction_picker_test: db/compaction/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1441,10 +1530,10 @@ util_merge_operators_test: utilities/util_merge_operators_test.o $(LIBOBJECTS) $ options_file_test: db/options_file_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) +deletefile_test: db/deletefile_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -obsolete_files_test: db/obsolete_files_test.o $(LIBOBJECTS) $(TESTHARNESS) +obsolete_files_test: db/obsolete_files_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) @@ -1453,10 +1542,10 @@ rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) rocksdb_undump: tools/dump/rocksdb_undump.o $(LIBOBJECTS) $(AM_LINK) -cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_builder_test: table/cuckoo/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) +cuckoo_table_reader_test: table/cuckoo/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1486,7 +1575,7 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) -event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +event_logger_test: logging/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1507,7 +1596,10 @@ manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: logging/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +env_logger_test: logging/env_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1552,6 +1644,9 @@ persistent_cache_test: utilities/persistent_cache/persistent_cache_test.o db/db statistics_test: monitoring/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +stats_history_test: monitoring/stats_history_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + lru_cache_test: cache/lru_cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1573,7 +1668,13 @@ range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -db_secondary_test: db/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +db_secondary_test: db/db_impl/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +block_cache_tracer_test: trace_replay/block_cache_tracer_test.o trace_replay/block_cache_tracer.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + 
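A minimal usage sketch for the build knobs introduced in this Makefile change (the -j level, the cache-line value of 128, and the assumption of a Linux host with RocksDB's usual build prerequisites are illustrative only, not part of the patch):

    # Build and run the folly DistributedMutex test that is gated on the new flag.
    USE_FOLLY_DISTRIBUTED_MUTEX=1 make -j8 folly_synchronization_distributed_mutex_test
    ./folly_synchronization_distributed_mutex_test
    # Compile a test with an overridden cache line size via the new TEST_CACHE_LINE_SIZE hook.
    TEST_CACHE_LINE_SIZE=128 make -j8 db_basic_test
    # Link the newly added filter_bench benchmark target.
    make -j8 filter_bench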
+block_cache_trace_analyzer_test: tools/block_cache_analyzer/block_cache_trace_analyzer_test.o tools/block_cache_analyzer/block_cache_trace_analyzer.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) #------------------------------------------------- @@ -1621,7 +1722,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64)) + ifneq (,$(filter amd64 ppc64 ppc64le arm64 aarch64 sparc64, $(MACHINE))) ARCH := 64 else ARCH := 32 @@ -1630,12 +1731,23 @@ else ARCH := $(shell getconf LONG_BIT) endif -ifeq (,$(findstring ppc,$(MACHINE))) - ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so +ifeq ($(shell ldd /usr/bin/env 2>/dev/null | grep -q musl; echo $$?),0) + JNI_LIBC = musl +# GNU LibC (or glibc) is so pervasive we can assume it is the default +# else +# JNI_LIBC = glibc +endif + +ifneq ($(origin JNI_LIBC), undefined) + JNI_LIBC_POSTFIX = -$(JNI_LIBC) +endif + +ifneq (,$(filter ppc% arm64 aarch64 sparc64, $(MACHINE))) + ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so else - ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE).so + ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so endif -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH)$(JNI_LIBC_POSTFIX).jar ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar @@ -1646,15 +1758,15 @@ ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1 ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.6 BZIP2_SHA256 ?= a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd -BZIP2_DOWNLOAD_BASE ?= https://web.archive.org/web/20180624184835/http://www.bzip.org +BZIP2_DOWNLOAD_BASE ?= https://downloads.sourceforge.net/project/bzip2 SNAPPY_VER ?= 1.1.7 SNAPPY_SHA256 ?= 3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4 SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.8.3 -LZ4_SHA256 ?= 33af5936ac06536805f9745e0b6d61da606a1f8b4cc5c04dd3cbaca3b9b4fc43 +LZ4_VER ?= 1.9.2 +LZ4_SHA256 ?= 658ba6191fa44c92280d4aa2c271b0f4fbc0e34d249578dd05e50e76d0e5efcc LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.3.7 -ZSTD_SHA256 ?= 5dd1e90eb16c25425880c8a91327f63de22891ffed082fcc17e5ae84fce0d5fb +ZSTD_VER ?= 1.4.4 +ZSTD_SHA256 ?= a364f5162c7d1a455cc915e8e3cf5f4bd8b75d09bc0f53965b0c9ca1383c52c8 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 @@ -1708,7 +1820,7 @@ endif libbz2.a: -rm -rf bzip2-$(BZIP2_VER) ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz)) - curl --output bzip2-$(BZIP2_VER).tar.gz -L ${BZIP2_DOWNLOAD_BASE}/$(BZIP2_VER)/bzip2-$(BZIP2_VER).tar.gz + curl --output bzip2-$(BZIP2_VER).tar.gz -L ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz endif BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \ @@ -1809,39 +1921,47 @@ rocksdbjavastatic: $(java_static_all_libobjects) cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org rocksdbjavastaticrelease: 
rocksdbjavastatic - cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 + cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class -rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 +rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class rocksdbjavastaticdockerx86: mkdir -p java/target - DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ - if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x86-be + docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target - DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ - if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_x64-be + docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerppc64le: mkdir -p java/target - DOCKER_LINUX_PPC64LE_CONTAINER=`docker ps -aqf name=rocksdb_linux_ppc64le-be`; \ - if [ -z "$$DOCKER_LINUX_PPC64LE_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_ppc64le-be evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ - fi - docker start -a rocksdb_linux_ppc64le-be + docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerarm64v8: + mkdir -p 
java/target + docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerx86musl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_x86-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerx86_64musl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerppc64lemusl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + +rocksdbjavastaticdockerarm64v8musl: + mkdir -p java/target + docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral @@ -1852,6 +1972,8 @@ rocksdbjavastaticpublishcentral: mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64-musl.jar -Dclassifier=linux64-musl + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32-musl.jar -Dclassifier=linux32-musl mvn gpg:sign-and-deploy-file 
-Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-win64.jar -Dclassifier=win64 mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar @@ -1959,7 +2081,7 @@ endif # Source files dependencies detection # --------------------------------------------------------------------------- -all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) +all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) DEPFILES = $(all_sources:.cc=.cc.d) # Add proper dependency support so changing a .h file forces a .cc file to diff --git a/README.md b/README.md index f1bc0c05f3c..9ef21ee57df 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,11 @@ It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by S and Jeff Dean (jeff@google.com) This code is a library that forms the core building block for a fast -key value server, especially suited for storing data on flash drives. +key-value server, especially suited for storing data on flash drives. It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF) and Space-Amplification-Factor (SAF). It has multi-threaded compactions, -making it specially suitable for storing multiple terabytes of data in a +making it especially suitable for storing multiple terabytes of data in a single database. Start with example usage here: https://github.com/facebook/rocksdb/tree/master/examples diff --git a/TARGETS b/TARGETS index 073c977e5ad..ab1f24cd76c 100644 --- a/TARGETS +++ b/TARGETS @@ -1,3 +1,8 @@ +# This file @generated by `python buckifier/buckify_rocksdb.py` +# --> DO NOT EDIT MANUALLY <-- +# This file is a Facebook-specific integration for buck builds, so can +# only be validated by Facebook employees. 
+# load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") load(":defs.bzl", "test_binary") @@ -6,30 +11,11 @@ REPO_PATH = package_name() + "/" ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ @@ -40,11 +26,54 @@ ROCKSDB_EXTERNAL_DEPS = [ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -52,7 +81,6 @@ ROCKSDB_PREPROCESSOR_FLAGS = [ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -69,9 +97,15 @@ sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. 
-ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) cpp_library( name = "rocksdb_lib", @@ -79,27 +113,29 @@ cpp_library( "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", + "db/arena_wrapped_db_iter.cc", "db/builder.cc", "db/c.cc", "db/column_family.cc", "db/compacted_db_impl.cc", - "db/compaction.cc", - "db/compaction_iterator.cc", - "db/compaction_job.cc", - "db/compaction_picker.cc", - "db/compaction_picker_fifo.cc", - "db/compaction_picker_universal.cc", + "db/compaction/compaction.cc", + "db/compaction/compaction_iterator.cc", + "db/compaction/compaction_job.cc", + "db/compaction/compaction_picker.cc", + "db/compaction/compaction_picker_fifo.cc", + "db/compaction/compaction_picker_level.cc", + "db/compaction/compaction_picker_universal.cc", "db/convenience.cc", "db/db_filesnapshot.cc", - "db/db_impl.cc", - "db/db_impl_compaction_flush.cc", - "db/db_impl_debug.cc", - "db/db_impl_experimental.cc", - "db/db_impl_files.cc", - "db/db_impl_open.cc", - "db/db_impl_readonly.cc", - "db/db_impl_secondary.cc", - "db/db_impl_write.cc", + "db/db_impl/db_impl.cc", + "db/db_impl/db_impl_compaction_flush.cc", + "db/db_impl/db_impl_debug.cc", + "db/db_impl/db_impl_experimental.cc", + "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_open.cc", + "db/db_impl/db_impl_readonly.cc", + "db/db_impl/db_impl_secondary.cc", + "db/db_impl/db_impl_write.cc", "db/db_info_dumper.cc", "db/db_iter.cc", "db/dbformat.cc", @@ -111,7 +147,7 @@ cpp_library( "db/flush_job.cc", "db/flush_scheduler.cc", "db/forward_iterator.cc", - "db/in_memory_stats_history.cc", + "db/import_column_family_job.cc", "db/internal_stats.cc", "db/log_reader.cc", "db/log_writer.cc", @@ -128,6 +164,7 @@ cpp_library( "db/table_cache.cc", "db/table_properties_collector.cc", "db/transaction_log_impl.cc", + "db/trim_history_scheduler.cc", "db/version_builder.cc", "db/version_edit.cc", "db/version_set.cc", @@ -143,6 +180,22 @@ cpp_library( "env/env_posix.cc", "env/io_posix.cc", "env/mock_env.cc", + "file/delete_scheduler.cc", + "file/file_prefetch_buffer.cc", + "file/file_util.cc", + "file/filename.cc", + "file/random_access_file_reader.cc", + "file/read_write_util.cc", + "file/readahead_raf.cc", + "file/sequence_file_reader.cc", + "file/sst_file_manager_impl.cc", + "file/writable_file_writer.cc", + "logging/auto_roll_logger.cc", + "logging/event_logger.cc", + "logging/log_buffer.cc", + "memory/arena.cc", + "memory/concurrent_arena.cc", + "memory/jemalloc_nodump_allocator.cc", "memtable/alloc_tracker.cc", "memtable/hash_linklist_rep.cc", "memtable/hash_skiplist_rep.cc", @@ -151,10 +204,12 @@ cpp_library( "memtable/write_buffer_manager.cc", "monitoring/histogram.cc", "monitoring/histogram_windowing.cc", + "monitoring/in_memory_stats_history.cc", "monitoring/instrumented_mutex.cc", "monitoring/iostats_context.cc", "monitoring/perf_context.cc", "monitoring/perf_level.cc", + "monitoring/persistent_stats_history.cc", "monitoring/statistics.cc", "monitoring/thread_status_impl.cc", "monitoring/thread_status_updater.cc", @@ -169,78 +224,70 @@ cpp_library( "options/options_sanity_check.cc", "port/port_posix.cc", "port/stack_trace.cc", - "table/adaptive_table_factory.cc", - "table/block.cc", - 
"table/block_based_filter_block.cc", - "table/block_based_table_builder.cc", - "table/block_based_table_factory.cc", - "table/block_based_table_reader.cc", - "table/block_builder.cc", + "table/adaptive/adaptive_table_factory.cc", + "table/block_based/block.cc", + "table/block_based/block_based_filter_block.cc", + "table/block_based/block_based_table_builder.cc", + "table/block_based/block_based_table_factory.cc", + "table/block_based/block_based_table_reader.cc", + "table/block_based/block_builder.cc", + "table/block_based/block_prefix_index.cc", + "table/block_based/data_block_footer.cc", + "table/block_based/data_block_hash_index.cc", + "table/block_based/filter_block_reader_common.cc", + "table/block_based/filter_policy.cc", + "table/block_based/flush_block_policy.cc", + "table/block_based/full_filter_block.cc", + "table/block_based/index_builder.cc", + "table/block_based/parsed_full_filter_block.cc", + "table/block_based/partitioned_filter_block.cc", + "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", - "table/block_prefix_index.cc", - "table/bloom_block.cc", - "table/cuckoo_table_builder.cc", - "table/cuckoo_table_factory.cc", - "table/cuckoo_table_reader.cc", - "table/data_block_footer.cc", - "table/data_block_hash_index.cc", - "table/flush_block_policy.cc", + "table/cuckoo/cuckoo_table_builder.cc", + "table/cuckoo/cuckoo_table_factory.cc", + "table/cuckoo/cuckoo_table_reader.cc", "table/format.cc", - "table/full_filter_block.cc", "table/get_context.cc", - "table/index_builder.cc", "table/iterator.cc", "table/merging_iterator.cc", "table/meta_blocks.cc", - "table/partitioned_filter_block.cc", "table/persistent_cache_helper.cc", - "table/plain_table_builder.cc", - "table/plain_table_factory.cc", - "table/plain_table_index.cc", - "table/plain_table_key_coding.cc", - "table/plain_table_reader.cc", + "table/plain/plain_table_bloom.cc", + "table/plain/plain_table_builder.cc", + "table/plain/plain_table_factory.cc", + "table/plain/plain_table_index.cc", + "table/plain/plain_table_key_coding.cc", + "table/plain/plain_table_reader.cc", "table/sst_file_reader.cc", "table/sst_file_writer.cc", "table/table_properties.cc", "table/two_level_iterator.cc", + "test_util/sync_point.cc", + "test_util/sync_point_impl.cc", + "test_util/transaction_test_util.cc", "tools/dump/db_dump_tool.cc", "tools/ldb_cmd.cc", "tools/ldb_tool.cc", "tools/sst_dump_tool.cc", - "util/arena.cc", - "util/auto_roll_logger.cc", - "util/bloom.cc", + "trace_replay/block_cache_tracer.cc", + "trace_replay/trace_replay.cc", "util/build_version.cc", "util/coding.cc", "util/compaction_job_stats_impl.cc", "util/comparator.cc", "util/compression_context_cache.cc", - "util/concurrent_arena.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", - "util/delete_scheduler.cc", "util/dynamic_bloom.cc", - "util/event_logger.cc", - "util/file_reader_writer.cc", - "util/file_util.cc", - "util/filename.cc", - "util/filter_policy.cc", "util/hash.cc", - "util/jemalloc_nodump_allocator.cc", - "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", "util/slice.cc", - "util/sst_file_manager_impl.cc", "util/status.cc", "util/string_util.cc", - "util/sync_point.cc", - "util/sync_point_impl.cc", "util/thread_local.cc", "util/threadpool_imp.cc", - "util/trace_replay.cc", - "util/transaction_test_util.cc", "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_compaction_filter.cc", @@ -266,9 +313,11 @@ cpp_library( "utilities/merge_operators/bytesxor.cc", 
"utilities/merge_operators/max.cc", "utilities/merge_operators/put.cc", + "utilities/merge_operators/sortlist.cc", "utilities/merge_operators/string_append/stringappend.cc", "utilities/merge_operators/string_append/stringappend2.cc", "utilities/merge_operators/uint64add.cc", + "utilities/object_registry.cc", "utilities/option_change_migration/option_change_migration.cc", "utilities/options/options_util.cc", "utilities/persistent_cache/block_cache_tier.cc", @@ -276,6 +325,7 @@ cpp_library( "utilities/persistent_cache/block_cache_tier_metadata.cc", "utilities/persistent_cache/persistent_cache_tier.cc", "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/simulator_cache/cache_simulator.cc", "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", @@ -299,6 +349,8 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -309,15 +361,18 @@ cpp_library( srcs = [ "db/db_test_util.cc", "table/mock_table.cc", + "test_util/fault_injection_test_env.cc", + "test_util/testharness.cc", + "test_util/testutil.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/trace_analyzer_tool.cc", - "util/fault_injection_test_env.cc", - "util/testharness.cc", - "util/testutil.cc", "utilities/cassandra/test_utils.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -326,13 +381,34 @@ cpp_library( cpp_library( name = "rocksdb_tools_lib", srcs = [ + "test_util/testutil.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", "tools/trace_analyzer_tool.cc", - "util/testutil.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [":rocksdb_lib"], + external_deps = ROCKSDB_EXTERNAL_DEPS, +) + +cpp_library( + name = "rocksdb_stress_lib", + srcs = [ + "test_util/testutil.cc", + "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", + "tools/db_stress_tool.cc", + "tools/trace_analyzer_tool.cc", + ], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -344,722 +420,1057 @@ cpp_library( auto_headers = AutoHeaders.RECURSIVE_GLOB, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_test_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, ) -# 
[test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ [ "arena_test", - "util/arena_test.cc", + "memory/arena_test.cc", "serial", + [], + [], ], [ "auto_roll_logger_test", - "util/auto_roll_logger_test.cc", + "logging/auto_roll_logger_test.cc", "serial", + [], + [], ], [ "autovector_test", "util/autovector_test.cc", "serial", + [], + [], ], [ "backupable_db_test", "utilities/backupable/backupable_db_test.cc", "parallel", + [], + [], ], [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", "serial", + [], + [], ], [ "block_based_filter_block_test", - "table/block_based_filter_block_test.cc", + "table/block_based/block_based_filter_block_test.cc", "serial", + [], + [], + ], + [ + "block_cache_trace_analyzer_test", + "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", + "serial", + [], + [], + ], + [ + "block_cache_tracer_test", + "trace_replay/block_cache_tracer_test.cc", + "serial", + [], + [], ], [ "block_test", - "table/block_test.cc", + "table/block_based/block_test.cc", "serial", + [], + [], ], [ "bloom_test", "util/bloom_test.cc", "serial", + [], + [], ], [ "c_test", "db/c_test.c", "serial", + [], + [], + ], + [ + "cache_simulator_test", + "utilities/simulator_cache/cache_simulator_test.cc", + "serial", + [], + [], ], [ "cache_test", "cache/cache_test.cc", "serial", + [], + [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", "serial", + [], + [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", "serial", + [], + [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", "serial", + [], + [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", "serial", + [], + [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", "serial", + [], + [], ], [ "cleanable_test", "table/cleanable_test.cc", "serial", + [], + [], ], [ "coding_test", "util/coding_test.cc", "serial", + [], + [], ], [ "column_family_test", "db/column_family_test.cc", "serial", + [], + [], ], [ "compact_files_test", "db/compact_files_test.cc", "serial", + [], + [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", "serial", + [], + [], ], [ "compaction_iterator_test", - "db/compaction_iterator_test.cc", + "db/compaction/compaction_iterator_test.cc", "serial", + [], + [], ], [ "compaction_job_stats_test", - "db/compaction_job_stats_test.cc", + "db/compaction/compaction_job_stats_test.cc", "serial", + [], + [], ], [ "compaction_job_test", - "db/compaction_job_test.cc", + "db/compaction/compaction_job_test.cc", "serial", + [], + [], ], [ "compaction_picker_test", - "db/compaction_picker_test.cc", + "db/compaction/compaction_picker_test.cc", "serial", + [], + [], ], [ "comparator_db_test", "db/comparator_db_test.cc", "serial", + [], + [], ], [ "corruption_test", "db/corruption_test.cc", "serial", + [], + [], ], [ "crc32c_test", "util/crc32c_test.cc", "serial", + [], + [], ], [ "cuckoo_table_builder_test", - "table/cuckoo_table_builder_test.cc", + "table/cuckoo/cuckoo_table_builder_test.cc", "serial", + [], + [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", "serial", + [], + [], ], [ "cuckoo_table_reader_test", - "table/cuckoo_table_reader_test.cc", + "table/cuckoo/cuckoo_table_reader_test.cc", "serial", + [], + [], ], [ "data_block_hash_index_test", - "table/data_block_hash_index_test.cc", + 
"table/block_based/data_block_hash_index_test.cc", "serial", + [], + [], ], [ "db_basic_test", "db/db_basic_test.cc", "serial", + [], + [], ], [ "db_blob_index_test", "db/db_blob_index_test.cc", "serial", + [], + [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", "serial", + [], + [], ], [ "db_bloom_filter_test", "db/db_bloom_filter_test.cc", - "serial", + "parallel", + [], + [], ], [ "db_compaction_filter_test", "db/db_compaction_filter_test.cc", "parallel", + [], + [], ], [ "db_compaction_test", "db/db_compaction_test.cc", "parallel", + [], + [], ], [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", "serial", + [], + [], ], [ "db_encryption_test", "db/db_encryption_test.cc", "serial", + [], + [], ], [ "db_flush_test", "db/db_flush_test.cc", "serial", + [], + [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", "serial", + [], + [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", "serial", + [], + [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", "serial", + [], + [], ], [ "db_iter_test", "db/db_iter_test.cc", "serial", + [], + [], ], [ "db_iterator_test", "db/db_iterator_test.cc", "serial", + [], + [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", "serial", + [], + [], ], [ "db_memtable_test", "db/db_memtable_test.cc", "serial", + [], + [], + ], + [ + "db_merge_operand_test", + "db/db_merge_operand_test.cc", + "serial", + [], + [], ], [ "db_merge_operator_test", "db/db_merge_operator_test.cc", "parallel", + [], + [], ], [ "db_options_test", "db/db_options_test.cc", "serial", + [], + [], ], [ "db_properties_test", "db/db_properties_test.cc", "serial", + [], + [], ], [ "db_range_del_test", "db/db_range_del_test.cc", "serial", + [], + [], ], [ "db_secondary_test", - "db/db_secondary_test.cc", + "db/db_impl/db_secondary_test.cc", "serial", + [], + [], ], [ "db_sst_test", "db/db_sst_test.cc", "parallel", + [], + [], ], [ "db_statistics_test", "db/db_statistics_test.cc", "serial", + [], + [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", "serial", + [], + [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", "serial", + [], + [], ], [ "db_test", "db/db_test.cc", "parallel", + [], + [], ], [ "db_test2", "db/db_test2.cc", "serial", + [], + [], ], [ "db_universal_compaction_test", "db/db_universal_compaction_test.cc", "parallel", + [], + [], ], [ "db_wal_test", "db/db_wal_test.cc", "parallel", + [], + [], ], [ "db_write_test", "db/db_write_test.cc", "serial", + [], + [], ], [ "dbformat_test", "db/dbformat_test.cc", "serial", + [], + [], ], [ "delete_scheduler_test", - "util/delete_scheduler_test.cc", + "file/delete_scheduler_test.cc", "serial", + [], + [], ], [ "deletefile_test", "db/deletefile_test.cc", "serial", + [], + [], ], [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", "serial", + [], + [], ], [ "env_basic_test", "env/env_basic_test.cc", "serial", + [], + [], + ], + [ + "env_logger_test", + "logging/env_logger_test.cc", + "serial", + [], + [], ], [ "env_test", "env/env_test.cc", "serial", + [], + [], ], [ "env_timed_test", "utilities/env_timed_test.cc", "serial", + [], + [], ], [ "error_handler_test", "db/error_handler_test.cc", "serial", + [], + [], ], [ "event_logger_test", - "util/event_logger_test.cc", + "logging/event_logger_test.cc", "serial", + [], + [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", "serial", + [], + [], ], [ "external_sst_file_test", "db/external_sst_file_test.cc", "parallel", + [], + [], ], [ "fault_injection_test", "db/fault_injection_test.cc", 
"parallel", + [], + [], ], [ "file_indexer_test", "db/file_indexer_test.cc", "serial", + [], + [], ], [ "file_reader_writer_test", "util/file_reader_writer_test.cc", - "serial", + "parallel", + [], + [], ], [ "filelock_test", "util/filelock_test.cc", "serial", + [], + [], ], [ "filename_test", "db/filename_test.cc", "serial", + [], + [], ], [ "flush_job_test", "db/flush_job_test.cc", "serial", + [], + [], ], [ "full_filter_block_test", - "table/full_filter_block_test.cc", + "table/block_based/full_filter_block_test.cc", "serial", + [], + [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", "serial", + [], + [], ], [ "hash_test", "util/hash_test.cc", "serial", + [], + [], ], [ "heap_test", "util/heap_test.cc", "serial", + [], + [], ], [ "histogram_test", "monitoring/histogram_test.cc", "serial", + [], + [], + ], + [ + "import_column_family_test", + "db/import_column_family_test.cc", + "parallel", + [], + [], ], [ "inlineskiplist_test", "memtable/inlineskiplist_test.cc", "parallel", + [], + [], ], [ "iostats_context_test", "monitoring/iostats_context_test.cc", "serial", + [], + [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", "serial", + [], + [], ], [ "listener_test", "db/listener_test.cc", "serial", + [], + [], ], [ "log_test", "db/log_test.cc", "serial", + [], + [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", "serial", + [], + [], ], [ "manual_compaction_test", "db/manual_compaction_test.cc", "parallel", + [], + [], ], [ "memory_test", "utilities/memory/memory_test.cc", "serial", + [], + [], ], [ "memtable_list_test", "db/memtable_list_test.cc", "serial", + [], + [], ], [ "merge_helper_test", "db/merge_helper_test.cc", "serial", + [], + [], ], [ "merge_test", "db/merge_test.cc", "serial", + [], + [], ], [ "merger_test", "table/merger_test.cc", "serial", + [], + [], ], [ "mock_env_test", "env/mock_env_test.cc", "serial", + [], + [], ], [ "object_registry_test", "utilities/object_registry_test.cc", "serial", + [], + [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", "serial", + [], + [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", "serial", + [], + [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", "serial", + [], + [], ], [ "options_file_test", "db/options_file_test.cc", "serial", + [], + [], ], [ "options_settable_test", "options/options_settable_test.cc", "serial", + [], + [], ], [ "options_test", "options/options_test.cc", "serial", + [], + [], ], [ "options_util_test", "utilities/options/options_util_test.cc", "serial", + [], + [], ], [ "partitioned_filter_block_test", - "table/partitioned_filter_block_test.cc", + "table/block_based/partitioned_filter_block_test.cc", "serial", + [], + [], ], [ "perf_context_test", "db/perf_context_test.cc", "serial", + [], + [], ], [ "persistent_cache_test", "utilities/persistent_cache/persistent_cache_test.cc", "parallel", + [], + [], ], [ "plain_table_db_test", "db/plain_table_db_test.cc", "serial", + [], + [], ], [ "prefix_test", "db/prefix_test.cc", "serial", + [], + [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", "serial", + [], + [], ], [ "range_tombstone_fragmenter_test", "db/range_tombstone_fragmenter_test.cc", "serial", + [], + [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", "serial", + [], + [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", "serial", + [], + [], ], [ "repair_test", "db/repair_test.cc", "serial", + [], + [], ], [ 
"repeatable_thread_test", "util/repeatable_thread_test.cc", "serial", + [], + [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", "serial", + [], + [], ], [ "skiplist_test", "memtable/skiplist_test.cc", "serial", + [], + [], ], [ "slice_transform_test", "util/slice_transform_test.cc", "serial", + [], + [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", "serial", + [], + [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", "serial", + [], + [], ], [ "statistics_test", "monitoring/statistics_test.cc", "serial", + [], + [], + ], + [ + "stats_history_test", + "monitoring/stats_history_test.cc", + "serial", + [], + [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", "serial", + [], + [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", "serial", + [], + [], ], [ "table_test", "table/table_test.cc", "parallel", + [], + [], ], [ "thread_list_test", "util/thread_list_test.cc", "serial", + [], + [], ], [ "thread_local_test", "util/thread_local_test.cc", "serial", + [], + [], ], [ "timer_queue_test", "util/timer_queue_test.cc", "serial", + [], + [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", "serial", + [], + [], ], [ "transaction_test", "utilities/transactions/transaction_test.cc", "parallel", + [], + [], ], [ "ttl_test", "utilities/ttl/ttl_test.cc", "serial", + [], + [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", "serial", + [], + [], ], [ "version_builder_test", "db/version_builder_test.cc", "serial", + [], + [], ], [ "version_edit_test", "db/version_edit_test.cc", "serial", + [], + [], ], [ "version_set_test", "db/version_set_test.cc", "serial", + [], + [], ], [ "wal_manager_test", "db/wal_manager_test.cc", "serial", + [], + [], ], [ "write_batch_test", "db/write_batch_test.cc", "serial", + [], + [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", "serial", + [], + [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", "serial", + [], + [], ], [ "write_callback_test", "db/write_callback_test.cc", "serial", + [], + [], ], [ "write_controller_test", "db/write_controller_test.cc", "serial", + [], + [], ], [ "write_prepared_transaction_test", "utilities/transactions/write_prepared_transaction_test.cc", "parallel", + [], + [], ], [ "write_unprepared_transaction_test", "utilities/transactions/write_unprepared_transaction_test.cc", "parallel", + [], + [], ], ] @@ -1068,14 +1479,18 @@ ROCKS_TESTS = [ # will not be included. [ test_binary( + extra_compiler_flags = extra_compiler_flags, + extra_deps = extra_deps, parallelism = parallelism, rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, ) - for test_name, test_cc, parallelism in ROCKS_TESTS + for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode ] diff --git a/USERS.md b/USERS.md index a95903f0662..0fc030339d8 100644 --- a/USERS.md +++ b/USERS.md @@ -50,6 +50,10 @@ Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santande ## Airbnb Airbnb is using RocksDB as a storage engine for their personalized search service. 
You can learn more about it here: https://www.youtube.com/watch?v=ASQ6XMtogMs +## Alluxio +[Alluxio](https://www.alluxio.io) uses RocksDB to serve and scale file system metadata to beyond 1 Billion files. The detailed design and implementation is described in this engineering blog: +https://www.alluxio.io/blog/scalable-metadata-service-in-alluxio-storing-billions-of-files/ + ## Pinterest Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo @@ -91,4 +95,10 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed [ProfaneDB](https://profanedb.gitlab.io/) is a database for Protocol Buffers, and uses RocksDB for storage. It is accessible via gRPC, and the schema is defined using directly `.proto` files. ## IOTA Foundation - [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. \ No newline at end of file + [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. + +## Avrio Project + [Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances, and other blockchain-related data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions. + +## Crux +[Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability. diff --git a/appveyor.yml b/appveyor.yml index 9dae40af8f7..77901c40724 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,15 +1,67 @@ version: 1.0.{build} + image: Visual Studio 2017 + +environment: + JAVA_HOME: C:\Program Files\Java\jdk1.8.0 + THIRDPARTY_HOME: $(APPVEYOR_BUILD_FOLDER)\thirdparty + SNAPPY_HOME: $(THIRDPARTY_HOME)\snappy-1.1.7 + SNAPPY_INCLUDE: $(SNAPPY_HOME);$(SNAPPY_HOME)\build + SNAPPY_LIB_DEBUG: $(SNAPPY_HOME)\build\Debug\snappy.lib + SNAPPY_LIB_RELEASE: $(SNAPPY_HOME)\build\Release\snappy.lib + LZ4_HOME: $(THIRDPARTY_HOME)\lz4-1.8.3 + LZ4_INCLUDE: $(LZ4_HOME)\lib + LZ4_LIB_DEBUG: $(LZ4_HOME)\visual\VS2010\bin\x64_Debug\liblz4_static.lib + LZ4_LIB_RELEASE: $(LZ4_HOME)\visual\VS2010\bin\x64_Release\liblz4_static.lib + ZSTD_HOME: $(THIRDPARTY_HOME)\zstd-1.4.0 + ZSTD_INCLUDE: $(ZSTD_HOME)\lib;$(ZSTD_HOME)\lib\dictBuilder + ZSTD_LIB_DEBUG: $(ZSTD_HOME)\build\VS2010\bin\x64_Debug\libzstd_static.lib + ZSTD_LIB_RELEASE: $(ZSTD_HOME)\build\VS2010\bin\x64_Release\libzstd_static.lib + +install: + - md %THIRDPARTY_HOME% + - echo "Building Snappy dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o snappy-1.1.7.zip https://github.com/google/snappy/archive/1.1.7.zip + - unzip snappy-1.1.7.zip + - cd snappy-1.1.7 + - mkdir build + - cd build + - cmake -DCMAKE_GENERATOR_PLATFORM=x64 ..
+ - msbuild Snappy.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild Snappy.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building LZ4 dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o lz4-1.8.3.zip https://github.com/lz4/lz4/archive/v1.8.3.zip + - unzip lz4-1.8.3.zip + - cd lz4-1.8.3\visual\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD lz4.sln /upgrade + - msbuild lz4.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild lz4.sln /p:Configuration=Release /p:Platform=x64 + - echo "Building ZStd dependency..." + - cd %THIRDPARTY_HOME% + - curl -fsSL -o zstd-1.4.0.zip https://github.com/facebook/zstd/archive/v1.4.0.zip + - unzip zstd-1.4.0.zip + - cd zstd-1.4.0\build\VS2010 + - ps: $CMD="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\IDE\devenv.com"; & $CMD zstd.sln /upgrade + - msbuild zstd.sln /p:Configuration=Debug /p:Platform=x64 + - msbuild zstd.sln /p:Configuration=Release /p:Platform=x64 + before_build: -- md %APPVEYOR_BUILD_FOLDER%\build -- cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DWITH_XPRESS=1 -DPORTABLE=1 -DJNI=1 .. -- cd .. + - md %APPVEYOR_BUILD_FOLDER%\build + - cd %APPVEYOR_BUILD_FOLDER%\build + - cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DLZ4=1 -DZSTD=1 -DXPRESS=1 -DJNI=1 .. + - cd .. build: project: build\rocksdb.sln parallel: true verbosity: normal + test: + test_script: -- ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 + - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test,db_merge_operand_test -Concurrency 8 + +on_failure: + - cmd: 7z a build-failed.zip %APPVEYOR_BUILD_FOLDER%\build\ && appveyor PushArtifact build-failed.zip diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index a5d71b65d4e..d2bba5940cc 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -3,13 +3,36 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals +try: + from builtins import str +except ImportError: + from __builtin__ import str from targets_builder import TARGETSBuilder +import json import os import fnmatch import sys from util import ColorString +# This script generates the TARGETS file for Buck. +# Buck is a build tool specifying dependencies among different build targets. +# Users can pass extra dependencies as a JSON object via the command line, and this +# script can include these dependencies in the generated TARGETS file. +# Usage: +# $python buckifier/buckify_rocksdb.py +# (This generates a TARGETS file without user-specified dependencies for unit +# tests.) +# $python buckifier/buckify_rocksdb.py \ +# '{"fake": { \ +# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ +# } \ +# }' +# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB +# unit tests, and will use the extra_compiler_flags to compile the unit test +# source.)
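A hedged companion to the usage comment above (the "lite" alias, its single flag, and the resulting target name are hypothetical examples, not taken from this patch):

    # Regenerate TARGETS with one extra variant of every unit test; with the
    # generate_targets() changes below, each test also gains an alias-suffixed
    # target such as db_basic_test_lite.
    python buckifier/buckify_rocksdb.py '{"lite": {"extra_deps": [], "extra_compiler_flags": ["-DROCKSDB_LITE"]}}'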
+ # tests to export as libraries for inclusion in other projects _EXPORTED_TEST_LIBS = ["env_basic_test"] @@ -86,8 +109,32 @@ def get_tests(repo_path): return tests +# Parse extra dependencies passed by user from command line +def get_dependencies(): + deps_map = { + '': { + 'extra_deps': [], + 'extra_compiler_flags': [] + } + } + if len(sys.argv) < 2: + return deps_map + + def encode_dict(data): + rv = {} + for k, v in data.items(): + if isinstance(v, dict): + v = encode_dict(v) + rv[k] = v + return rv + extra_deps = json.loads(sys.argv[1], object_hook=encode_dict) + for target_alias, deps in extra_deps.items(): + deps_map[target_alias] = deps + return deps_map + + # Prepare TARGETS file for buck -def generate_targets(repo_path): +def generate_targets(repo_path, deps_map): print(ColorString.info("Generating TARGETS")) # parsed src.mk file src_mk = parse_src_mk(repo_path) @@ -118,27 +165,43 @@ def generate_targets(repo_path): "rocksdb_tools_lib", src_mk.get("BENCH_LIB_SOURCES", []) + src_mk.get("ANALYZER_LIB_SOURCES", []) + - ["util/testutil.cc"], + ["test_util/testutil.cc"], + [":rocksdb_lib"]) + # rocksdb_stress_lib + TARGETS.add_library( + "rocksdb_stress_lib", + src_mk.get("ANALYZER_LIB_SOURCES", []) + + src_mk.get('STRESS_LIB_SOURCES', []) + + ["test_util/testutil.cc"], [":rocksdb_lib"]) + print("Extra dependencies:\n{0}".format(str(deps_map))) # test for every test we found in the Makefile - for test in sorted(tests): - match_src = [src for src in cc_files if ("/%s.c" % test) in src] - if len(match_src) == 0: - print(ColorString.warning("Cannot find .cc file for %s" % test)) - continue - elif len(match_src) > 1: - print(ColorString.warning("Found more than one .cc for %s" % test)) - print(match_src) - continue - - assert(len(match_src) == 1) - is_parallel = tests[test] - TARGETS.register_test(test, match_src[0], is_parallel) - - if test in _EXPORTED_TEST_LIBS: - test_library = "%s_lib" % test - TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) + for target_alias, deps in deps_map.items(): + for test in sorted(tests): + match_src = [src for src in cc_files if ("/%s.c" % test) in src] + if len(match_src) == 0: + print(ColorString.warning("Cannot find .cc file for %s" % test)) + continue + elif len(match_src) > 1: + print(ColorString.warning("Found more than one .cc for %s" % test)) + print(match_src) + continue + + assert(len(match_src) == 1) + is_parallel = tests[test] + test_target_name = \ + test if not target_alias else test + "_" + target_alias + TARGETS.register_test( + test_target_name, + match_src[0], + is_parallel, + deps['extra_deps'], + deps['extra_compiler_flags']) + + if test in _EXPORTED_TEST_LIBS: + test_library = "%s_lib" % test_target_name + TARGETS.add_library(test_library, match_src, [":rocksdb_test_lib"]) TARGETS.flush_tests() print(ColorString.info("Generated TARGETS Summary:")) @@ -163,8 +226,9 @@ def exit_with_error(msg): def main(): + deps_map = get_dependencies() # Generate TARGETS file for buck - ok = generate_targets(get_rocksdb_path()) + ok = generate_targets(get_rocksdb_path(), deps_map) if not ok: exit_with_error("Failed to generate TARGETS files") diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index 493cd8a8a8a..ba90bc612da 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -3,6 +3,12 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals +try: + from builtins import object + from builtins import str +except ImportError: + 
from __builtin__ import object + from __builtin__ import str import targets_cfg def pretty_list(lst, indent=8): @@ -18,7 +24,7 @@ def pretty_list(lst, indent=8): return res -class TARGETSBuilder: +class TARGETSBuilder(object): def __init__(self, path): self.path = path self.targets_file = open(path, 'w') @@ -51,14 +57,21 @@ def add_binary(self, name, srcs, deps=None): pretty_list(deps))) self.total_bin = self.total_bin + 1 - def register_test(self, test_name, src, is_parallel): + def register_test(self, + test_name, + src, + is_parallel, + extra_deps, + extra_compiler_flags): exec_mode = "serial" if is_parallel: exec_mode = "parallel" self.tests_cfg += targets_cfg.test_cfg_template % ( test_name, str(src), - str(exec_mode)) + str(exec_mode), + extra_deps, + extra_compiler_flags) self.total_test = self.total_test + 1 diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 730b5ebf9da..0ecd6fdda76 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -3,7 +3,13 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -rocksdb_target_header = """load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") + +rocksdb_target_header = """# This file \100generated by `python buckifier/buckify_rocksdb.py` +# --> DO NOT EDIT MANUALLY <-- +# This file is a Facebook-specific integration for buck builds, so can +# only be validated by Facebook employees. +# +load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") load(":defs.bzl", "test_binary") @@ -11,30 +17,11 @@ ROCKSDB_COMPILER_FLAGS = [ "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DZSTD_STATIC_LINKING_ONLY", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", # Needed to compile in fbcode "-Wno-expansion-to-defined", # Added missing flags from output of build_detect_platform - "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", - "-DROCKSDB_BACKTRACE", "-Wnarrowing", + "-DROCKSDB_NO_DYNAMIC_EXTENSION", ] ROCKSDB_EXTERNAL_DEPS = [ @@ -45,11 +32,54 @@ ("lz4", None, "lz4"), ("zstd", None), ("tbb", None), - ("numa", None, "numa"), ("googletest", None, "gtest"), ] +ROCKSDB_OS_DEPS = [ + ( + "linux", + ["third-party//numa:numa"], + ), +] + +ROCKSDB_OS_PREPROCESSOR_FLAGS = [ + ( + "linux", + [ + "-DOS_LINUX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DHAVE_SSE42", + "-DNUMA", + ], + ), + ( + "macos", + ["-DOS_MACOSX"], + ), +] + ROCKSDB_PREPROCESSOR_FLAGS = [ + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DTBB", + + # Added missing flags from output of build_detect_platform + "-DROCKSDB_BACKTRACE", + # Directories with files for #include "-I" + REPO_PATH + "include/", "-I" + REPO_PATH, @@ -57,7 +87,6 @@ ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { "x86_64": [ - "-DHAVE_SSE42", "-DHAVE_PCLMUL", ], } @@ -74,9 +103,15 @@ # Do not enable jemalloc if sanitizer 
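The new ROCKSDB_OS_DEPS and ROCKSDB_OS_PREPROCESSOR_FLAGS lists in targets_cfg.py pair an OS name with the dependencies or defines that apply only on that platform, which is the shape Buck's os_deps / os_preprocessor_flags attributes expect. The selector below is only a toy stand-in to illustrate the data shape, not how Buck actually evaluates it:

```python
# Illustration of the (os_name, values) pairs used by
# ROCKSDB_OS_PREPROCESSOR_FLAGS; flags_for() is a toy selector, not Buck code.
ROCKSDB_OS_PREPROCESSOR_FLAGS = [
    ("linux", ["-DOS_LINUX", "-DROCKSDB_FALLOCATE_PRESENT", "-DNUMA"]),
    ("macos", ["-DOS_MACOSX"]),
]

def flags_for(os_name):
    flags = []
    for name, values in ROCKSDB_OS_PREPROCESSOR_FLAGS:
        if name == os_name:
            flags.extend(values)
    return flags

assert "-DOS_MACOSX" in flags_for("macos")
assert "-DOS_LINUX" not in flags_for("macos")
```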
presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) - -ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) +ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( + "linux", + ["-DROCKSDB_JEMALLOC"], +)] if sanitizer == "" else []) + +ROCKSDB_OS_DEPS += ([( + "linux", + ["third-party//jemalloc:headers"], +)] if sanitizer == "" else []) """ @@ -87,6 +122,8 @@ {headers_attr_prefix}headers = {headers}, arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, @@ -109,11 +146,13 @@ "%s", "%s", "%s", + %s, + %s, ], """ unittests_template = """ -# [test_name, test_src, test_type] +# [test_name, test_src, test_type, extra_deps, extra_compiler_flags] ROCKS_TESTS = [ %s] @@ -122,15 +161,19 @@ # will not be included. [ test_binary( + extra_compiler_flags = extra_compiler_flags, + extra_deps = extra_deps, parallelism = parallelism, rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_os_deps = ROCKSDB_OS_DEPS, + rocksdb_os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, test_cc = test_cc, test_name = test_name, ) - for test_name, test_cc, parallelism in ROCKS_TESTS + for test_name, test_cc, parallelism, extra_deps, extra_compiler_flags in ROCKS_TESTS if not is_opt_mode ] """ diff --git a/buckifier/util.py b/buckifier/util.py index 2eda69f1075..f04929a277c 100644 --- a/buckifier/util.py +++ b/buckifier/util.py @@ -6,11 +6,16 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals +try: + from builtins import object +except ImportError: + from __builtin__ import object import subprocess +import sys import os import time -class ColorString: +class ColorString(object): """ Generate colorful strings on terminal """ HEADER = '\033[95m' BLUE = '\033[94m' @@ -21,7 +26,13 @@ class ColorString: @staticmethod def _make_color_str(text, color): - return "".join([color, text.encode('utf-8'), ColorString.ENDC]) + # In Python2, default encoding for unicode string is ASCII + if sys.version_info.major <= 2: + return "".join( + [color, text.encode('utf-8'), ColorString.ENDC]) + # From Python3, default encoding for unicode string is UTF-8 + return "".join( + [color, text, ColorString.ENDC]) @staticmethod def ok(text): diff --git a/build_tools/RocksDBCommonHelper.php b/build_tools/RocksDBCommonHelper.php deleted file mode 100644 index e7bfb520347..00000000000 --- a/build_tools/RocksDBCommonHelper.php +++ /dev/null @@ -1,377 +0,0 @@ - 0); - assert(is_numeric($diffID)); - assert(strlen($url) > 0); - - $cmd_args = array( - 'diff_id' => (int)$diffID, - 'name' => sprintf( - 'click here for sandcastle tests for D%d', - (int)$diffID - ), - 'link' => $url - ); - $cmd = 'echo ' . escapeshellarg(json_encode($cmd_args)) - . 
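The ColorString change in buckifier/util.py branches on the interpreter version because str.encode('utf-8') yields bytes on Python 3, which cannot be joined with the str escape sequences. A minimal sketch of the same version check, using escape values like those in util.py:

```python
# Minimal sketch of the version-dependent join used by ColorString:
# Python 2 encodes the text to UTF-8 bytes, Python 3 uses the str as-is
# (encoding it there would produce bytes and break "".join()).
import sys

GREEN = '\033[92m'
ENDC = '\033[0m'

def green(text):
    if sys.version_info.major <= 2:
        return "".join([GREEN, text.encode('utf-8'), ENDC])
    return "".join([GREEN, text, ENDC])

print(green("ok"))
```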
' | arc call-conduit differential.updateunitresults'; - - shell_exec($cmd); -} - -function buildUpdateTestStatusCmd($diffID, $test, $status) { - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - assert(strlen($test) > 0); - assert(strlen($status) > 0); - - $cmd_args = array( - 'diff_id' => (int)$diffID, - 'name' => $test, - 'result' => $status - ); - - $cmd = 'echo ' . escapeshellarg(json_encode($cmd_args)) - . ' | arc call-conduit differential.updateunitresults'; - - return $cmd; -} - -function updateTestStatus($diffID, $test) { - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - assert(strlen($test) > 0); - - shell_exec(buildUpdateTestStatusCmd($diffID, $test, "waiting")); -} - -function getSteps($applyDiff, $diffID, $username, $test) { - assert(strlen($username) > 0); - assert(strlen($test) > 0); - - if ($applyDiff) { - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - - $arcrc_content = (PHP_OS == "Darwin" ? - exec("cat ~/.arcrc | gzip -f | base64") : - exec("cat ~/.arcrc | gzip -f | base64 -w0")); - assert(strlen($arcrc_content) > 0); - - // Sandcastle machines don't have arc setup. We copy the user certificate - // and authenticate using that in Sandcastle. - $setup = array( - "name" => "Setup arcrc", - "shell" => "echo " . escapeshellarg($arcrc_content) . " | base64 --decode" - . " | gzip -d > ~/.arcrc", - "user" => "root" - ); - - // arc demands certain permission on its config. - // also fix the sticky bit issue in sandcastle - $fix_permission = array( - "name" => "Fix environment", - "shell" => "chmod 600 ~/.arcrc && chmod +t /dev/shm", - "user" => "root" - ); - - // Construct the steps in the order of execution. - $steps[] = $setup; - $steps[] = $fix_permission; - } - - // fbcode is a sub-repo. We cannot patch until we add it to ignore otherwise - // Git thinks it is an uncommitted change. - $fix_git_ignore = array( - "name" => "Fix git ignore", - "shell" => "echo fbcode >> .git/info/exclude", - "user" => "root" - ); - - // This fixes "FATAL: ThreadSanitizer can not mmap the shadow memory" - // Source: - // https://github.com/google/sanitizers/wiki/ThreadSanitizerCppManual#FAQ - $fix_kernel_issue = array( - "name" => "Fix kernel issue with tsan", - "shell" => "echo 2 >/proc/sys/kernel/randomize_va_space", - "user" => "root" - ); - - $steps[] = $fix_git_ignore; - $steps[] = $fix_kernel_issue; - - // This will be the command used to execute particular type of tests. - $cmd = ""; - - if ($applyDiff) { - // Patch the code (keep your fingures crossed). - $patch = array( - "name" => "Patch " . $diffID, - "shell" => "arc --arcrc-file ~/.arcrc " - . "patch --nocommit --diff " . escapeshellarg($diffID), - "user" => "root" - ); - - $steps[] = $patch; - - updateTestStatus($diffID, $test); - $cmd = buildUpdateTestStatusCmd($diffID, $test, "running") . "; "; - } - - // Run the actual command. - $cmd = $cmd . "J=$(nproc) ./build_tools/precommit_checker.py " . - escapeshellarg($test) . "; exit_code=$?; "; - - if ($applyDiff) { - $cmd = $cmd . "([[ \$exit_code -eq 0 ]] &&" - . buildUpdateTestStatusCmd($diffID, $test, "pass") . ")" - . "||" . buildUpdateTestStatusCmd($diffID, $test, "fail") - . "; "; - } - - // shell command to sort the tests based on exit code and print - // the output of the log files. 
- $cat_sorted_logs = " - while read code log_file; - do echo \"################ cat \$log_file [exit_code : \$code] ################\"; - cat \$log_file; - done < <(tail -n +2 LOG | sort -k7,7n -k4,4gr | awk '{print \$7,\$NF}')"; - - // Shell command to cat all log files - $cat_all_logs = "for f in `ls t/!(run-*)`; do echo \$f;cat \$f; done"; - - // If LOG file exist use it to cat log files sorted by exit code, otherwise - // cat everything - $logs_cmd = "if [ -f LOG ]; then {$cat_sorted_logs}; else {$cat_all_logs}; fi"; - - $cmd = $cmd . " cat /tmp/precommit-check.log" - . "; shopt -s extglob; {$logs_cmd}" - . "; shopt -u extglob; [[ \$exit_code -eq 0 ]]"; - assert(strlen($cmd) > 0); - - $run_test = array( - "name" => "Run " . $test, - "shell" => $cmd, - "user" => "root", - "parser" => "python build_tools/error_filter.py " . escapeshellarg($test), - ); - - $steps[] = $run_test; - - if ($applyDiff) { - // Clean up the user arc config we are using. - $cleanup = array( - "name" => "Arc cleanup", - "shell" => "rm -f ~/.arcrc", - "user" => "root" - ); - - $steps[] = $cleanup; - } - - assert(count($steps) > 0); - return $steps; -} - -function getSandcastleConfig() { - $sandcastle_config = array(); - - $cwd = getcwd(); - $cwd_token_file = "{$cwd}/.sandcastle"; - // This is a case when we're executed from a continuous run. Fetch the values - // from the environment. - if (getenv(ENV_POST_RECEIVE_HOOK)) { - $sandcastle_config[0] = getenv(ENV_HTTPS_APP_VALUE); - $sandcastle_config[1] = getenv(ENV_HTTPS_TOKEN_VALUE); - } else { - // This is a typical `[p]arc diff` case. Fetch the values from the specific - // configuration files. - for ($i = 0; $i < 50; $i++) { - if (file_exists(PRIMARY_TOKEN_FILE) || - file_exists($cwd_token_file)) { - break; - } - // If we failed to fetch the tokens, sleep for 0.2 second and try again - usleep(200000); - } - assert(file_exists(PRIMARY_TOKEN_FILE) || - file_exists($cwd_token_file)); - - // Try the primary location first, followed by a secondary. - if (file_exists(PRIMARY_TOKEN_FILE)) { - $cmd = 'cat ' . PRIMARY_TOKEN_FILE; - } else { - $cmd = 'cat ' . escapeshellarg($cwd_token_file); - } - - assert(strlen($cmd) > 0); - $sandcastle_config = explode(':', rtrim(shell_exec($cmd))); - } - - // In this case be very explicit about the implications. - if (count($sandcastle_config) != 2) { - echo "Sandcastle configuration files don't contain valid information " . - "or the necessary environment variables aren't defined. Unable " . - "to validate the code changes."; - exit(1); - } - - assert(strlen($sandcastle_config[0]) > 0); - assert(strlen($sandcastle_config[1]) > 0); - assert(count($sandcastle_config) > 0); - - return $sandcastle_config; -} - -// This function can be called either from `[p]arc diff` command or during -// the Git post-receive hook. - function startTestsInSandcastle($applyDiff, $workflow, $diffID) { - // Default options don't terminate on failure, but that's what we want. In - // the current case we use assertions intentionally as "terminate on failure - // invariants". - assert_options(ASSERT_BAIL, true); - - // In case of a diff we'll send notificatios to the author. Else it'll go to - // the entire team because failures indicate that build quality has regressed. - $username = $applyDiff ? exec("whoami") : CONT_RUN_ALIAS; - assert(strlen($username) > 0); - - if ($applyDiff) { - assert($workflow); - assert(strlen($diffID) > 0); - assert(is_numeric($diffID)); - } - - // List of tests we want to run in Sandcastle. 
- $tests = array("unit", "unit_non_shm", "unit_481", "clang_unit", "tsan", - "asan", "lite_test", "valgrind", "release", "release_481", - "clang_release", "clang_analyze", "code_cov", - "java_build", "no_compression", "unity", "ubsan"); - - $send_email_template = array( - 'type' => 'email', - 'triggers' => array('fail'), - 'emails' => array($username . '@fb.com'), - ); - - // Construct a job definition for each test and add it to the master plan. - foreach ($tests as $test) { - $stepName = "RocksDB diff " . $diffID . " test " . $test; - - if (!$applyDiff) { - $stepName = "RocksDB continuous integration test " . $test; - } - - $arg[] = array( - "name" => $stepName, - "report" => array($send_email_template), - "steps" => getSteps($applyDiff, $diffID, $username, $test) - ); - } - - // We cannot submit the parallel execution master plan to Sandcastle and - // need supply the job plan as a determinator. So we construct a small job - // that will spit out the master job plan which Sandcastle will parse and - // execute. Why compress the job definitions? Otherwise we run over the max - // string size. - $cmd = "echo " . base64_encode(json_encode($arg)) - . (PHP_OS == "Darwin" ? - " | gzip -f | base64" : - " | gzip -f | base64 -w0"); - assert(strlen($cmd) > 0); - - $arg_encoded = shell_exec($cmd); - assert(strlen($arg_encoded) > 0); - - $runName = "Run diff " . $diffID . "for user " . $username; - - if (!$applyDiff) { - $runName = "RocksDB continuous integration build and test run"; - } - - $command = array( - "name" => $runName, - "steps" => array() - ); - - $command["steps"][] = array( - "name" => "Generate determinator", - "shell" => "echo " . $arg_encoded . " | base64 --decode | gzip -d" - . " | base64 --decode", - "determinator" => true, - "user" => "root" - ); - - // Submit to Sandcastle. - $url = 'https://interngraph.intern.facebook.com/sandcastle/create'; - - $job = array( - 'command' => 'SandcastleUniversalCommand', - 'args' => $command, - 'capabilities' => array( - 'vcs' => 'rocksdb-int-git', - 'type' => 'lego', - ), - 'hash' => 'origin/master', - 'user' => $username, - 'alias' => 'rocksdb-precommit', - 'tags' => array('rocksdb'), - 'description' => 'Rocksdb precommit job', - ); - - // Fetch the configuration necessary to submit a successful HTTPS request. - $sandcastle_config = getSandcastleConfig(); - - $app = $sandcastle_config[0]; - $token = $sandcastle_config[1]; - - $cmd = 'curl -s -k ' - . ' -F app=' . escapeshellarg($app) - . ' -F token=' . escapeshellarg($token) - . ' -F job=' . escapeshellarg(json_encode($job)) - .' ' . escapeshellarg($url); - - $output = shell_exec($cmd); - assert(strlen($output) > 0); - - // Extract Sandcastle URL from the response. - preg_match('/url": "(.+)"/', $output, $sandcastle_url); - - assert(count($sandcastle_url) > 0, "Unable to submit Sandcastle request."); - assert(strlen($sandcastle_url[1]) > 0, "Unable to extract Sandcastle URL."); - - if ($applyDiff) { - echo "\nSandcastle URL: " . $sandcastle_url[1] . "\n"; - // Ask Phabricator to display it on the diff UI. - postURL($diffID, $sandcastle_url[1]); - } else { - echo "Continuous integration started Sandcastle tests. You can look at "; - echo "the progress at:\n" . $sandcastle_url[1] . "\n"; - } -} - -// Continuous run cript will set the environment variable and based on that -// we'll trigger the execution of tests in Sandcastle. In that case we don't -// need to apply any diffs and there's no associated workflow either. 
-if (getenv(ENV_POST_RECEIVE_HOOK)) { - startTestsInSandcastle( - false /* $applyDiff */, - NULL /* $workflow */, - NULL /* $diffID */); -} diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 057f77ec531..45fdbe258ba 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -56,10 +56,10 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then # we need this to build with MySQL. Don't use for other purposes. source "$PWD/build_tools/fbcode_config4.8.1.sh" - elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007" ]; then - source "$PWD/build_tools/fbcode_config_platform007.sh" - else + elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_5xx" ]; then source "$PWD/build_tools/fbcode_config.sh" + else + source "$PWD/build_tools/fbcode_config_platform007.sh" fi fi @@ -150,6 +150,9 @@ case "$TARGET_OS" in PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" + if test -z "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + USE_FOLLY_DISTRIBUTED_MUTEX=1 + fi # PORT_FILES=port/linux/linux_specific.cc ;; SunOS) @@ -397,6 +400,7 @@ EOF #include int main() { size_t res = malloc_usable_size(0); + (void)res; return 0; } EOF @@ -411,6 +415,7 @@ EOF #include int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; + (void)x; return 0; } EOF @@ -422,7 +427,7 @@ EOF if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <> + #include int main() { void* frames[1]; backtrace_symbols(frames, backtrace(frames, 1)); @@ -480,6 +485,7 @@ EOF #include int main() { int cpuid = sched_getcpu(); + (void)cpuid; } EOF if [ "$?" = 0 ]; then @@ -515,7 +521,7 @@ fi if test "$USE_HDFS"; then if test -z "$JAVA_HOME"; then - echo "JAVA_HOME has to be set for HDFS usage." + echo "JAVA_HOME has to be set for HDFS usage." >&2 exit 1 fi HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS -I$HADOOP_HOME/include" @@ -527,42 +533,64 @@ if test "$USE_HDFS"; then JAVA_LDFLAGS="$JAVA_LDFLAGS $HDFS_LDFLAGS" fi -if test -z "$PORTABLE"; then +if test "0$PORTABLE" -eq 0; then if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then # Tune for this POWER processor, treating '+' models as base models POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+` COMMON_FLAGS="$COMMON_FLAGS -mcpu=$POWER -mtune=$POWER " elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then COMMON_FLAGS="$COMMON_FLAGS -march=z10 " - elif test -n "`echo $TARGET_ARCHITECTURE | grep ^arm`"; then + elif test -n "`echo $TARGET_ARCHITECTURE | grep -e^arm -e^aarch64`"; then # TODO: Handle this with approprite options. 
COMMON_FLAGS="$COMMON_FLAGS" + elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then + COMMON_FLAGS="$COMMON_FLAGS" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" - elif [ "$TARGET_OS" != "AIX" ] && [ "$TARGET_OS" != "SunOS" ]; then + elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then + # TODO: Not sure why we don't use -march=native on these OSes + if test "$USE_SSE"; then + TRY_SSE_ETC="1" + fi + else COMMON_FLAGS="$COMMON_FLAGS -march=native " - elif test "$USE_SSE"; then - COMMON_FLAGS="$COMMON_FLAGS -msse4.2 -mpclmul" fi -elif test "$USE_SSE"; then - COMMON_FLAGS="$COMMON_FLAGS -msse4.2 -mpclmul" +else + # PORTABLE=1 + if test "$USE_SSE"; then + TRY_SSE_ETC="1" + fi fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { volatile uint32_t x = _mm_crc32_u32(0, 0); + (void)x; } EOF if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DHAVE_SSE42" + COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42" elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" - exit 1 + echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 fi -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -570,13 +598,41 @@ $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o /dev/null 2>/dev/null < + #include + int main() { + const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5); + const auto b = _mm256_permutevar8x32_epi32(a, a); + (void)b; + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2" +elif test "$USE_SSE"; then + echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2 +fi + +$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + uint64_t a = 0xffffFFFFffffFFFF; + __uint128_t b = __uint128_t(a) * a; + a = static_cast(b >> 64); + (void)a; + } +EOF +if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DHAVE_UINT128_EXTENSION" fi # iOS doesn't support thread-local storage, but this check would erroneously @@ -589,6 +645,7 @@ if [ "$PLATFORM" != IOS ]; then #endif int main() { static __thread int tls; + (void)tls; } EOF if [ "$?" = 0 ]; then @@ -596,6 +653,19 @@ EOF fi fi +if [ "$FBCODE_BUILD" != "true" -a "$PLATFORM" = OS_LINUX ]; then + $CXX $COMMON_FLAGS $PLATFORM_SHARED_CFLAGS -x c++ -c - -o test_dl.o 2>/dev/null </dev/null + if [ "$?" = 0 ]; then + EXEC_LDFLAGS+="-ldl" + rm -f test_dl.o + fi + fi +fi + PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" @@ -640,3 +710,6 @@ if test -n "$WITH_JEMALLOC_FLAG"; then echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT" fi echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" +if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then + echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" +fi diff --git a/build_tools/cont_integration.sh b/build_tools/cont_integration.sh deleted file mode 100755 index 66d25522785..00000000000 --- a/build_tools/cont_integration.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2016, Facebook. All rights reserved. -# -# Overall wrapper script for RocksDB continuous builds. The implementation is a -# trivial pulling scheme. 
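build_detect_platform keeps probing CPU and toolchain features by piping a tiny test program into the compiler and appending a -DHAVE_* define only when compilation succeeds; the new AVX2 and __uint128_t probes follow the same pattern as the existing SSE4.2 one. A rough Python rendering of that probe loop, assuming a g++-style compiler on PATH (the flag names mirror the script, the helper itself is illustrative):

```python
# Rough model of the compile-and-probe pattern in build_detect_platform:
# feed a tiny program to the compiler on stdin and add the define only
# if it compiles. Compiler name and flags are assumptions for this sketch.
import subprocess

def probe(compiler, extra_flags, program):
    cmd = [compiler] + extra_flags + ["-x", "c++", "-", "-o", "/dev/null"]
    result = subprocess.run(cmd, input=program.encode(),
                            stderr=subprocess.DEVNULL)
    return result.returncode == 0

common_flags = []
avx2_program = """
#include <cstdint>
#include <immintrin.h>
int main() {
  const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5);
  const auto b = _mm256_permutevar8x32_epi32(a, a);
  (void)b;
}
"""
if probe("g++", ["-mavx2"], avx2_program):
    common_flags += ["-mavx2", "-DHAVE_AVX2"]
print(common_flags)
```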
We loop infinitely, check if any new changes have been -# committed, if yes then trigger a Sandcastle run, and finally go to sleep again -# for a certain interval. -# - -SRC_GIT_REPO=/data/git/rocksdb-public -error=0 - -function log { - DATE=`date +%Y-%m-%d:%H:%M:%S` - # shellcheck disable=SC2068 - echo $DATE $@ -} - -function log_err { - # shellcheck disable=SC2145 - log "ERROR: $@ Error code: $error." -} - -function update_repo_status { - # Update the parent first. - pushd $SRC_GIT_REPO - - # This is a fatal error. Something in the environment isn't right and we will - # terminate the execution. - error=$? - if [ ! $error -eq 0 ]; then - log_err "Where is $SRC_GIT_REPO?" - exit $error - fi - - HTTPS_PROXY=fwdproxy:8080 git fetch -f - - error=$? - if [ ! $error -eq 0 ]; then - log_err "git fetch -f failed." - popd - return $error - fi - - git update-ref refs/heads/master refs/remotes/origin/master - - error=$? - if [ ! $error -eq 0 ]; then - log_err "git update-ref failed." - popd - return $error - fi - - popd - - # We're back in an instance-specific directory. Get the latest changes. - git pull --rebase - - error=$? - if [ ! $error -eq 0 ]; then - log_err "git pull --rebase failed." - return $error - fi -} - -# -# Execution starts here. -# - -# Path to the determinator from the root of the RocksDB repo. -CONTRUN_DETERMINATOR=./build_tools/RocksDBCommonHelper.php - -# Value of the previous commit. -PREV_COMMIT= - -log "Starting to monitor for new RocksDB changes ..." -log "Running under `pwd` as `whoami`." - -# Paranoia. Make sure that we're using the right branch. -git checkout master - -error=$? -if [ ! $error -eq 0 ]; then - log_err "This is not good. Can't checkout master. Bye-bye!" - exit 1 -fi - -# We'll run forever and let the execution environment terminate us if we'll -# exceed whatever timeout is set for the job. -while true; -do - # Get the latest changes committed. - update_repo_status - - error=$? - if [ $error -eq 0 ]; then - LAST_COMMIT=`git log -1 | head -1 | grep commit | awk '{ print $2; }'` - - log "Last commit is '$LAST_COMMIT', previous commit is '$PREV_COMMIT'." - - if [ "$PREV_COMMIT" == "$LAST_COMMIT" ]; then - log "There were no changes since the last time I checked. Going to sleep." - else - if [ ! -z "$LAST_COMMIT" ]; then - log "New code has been committed or previous commit not known. " \ - "Will trigger the tests." - - PREV_COMMIT=$LAST_COMMIT - log "Updated previous commit to '$PREV_COMMIT'." - - # - # This is where we'll trigger the Sandcastle run. The values for - # HTTPS_APP_VALUE and HTTPS_APP_VALUE will be set in the container we're - # running in. - # - POST_RECEIVE_HOOK=1 php $CONTRUN_DETERMINATOR - - error=$? - if [ $error -eq 0 ]; then - log "Sandcastle run successfully triggered." - else - log_err "Failed to trigger Sandcastle run." - fi - else - log_err "Previous commit not updated. Don't know what the last one is." - fi - fi - else - log_err "Getting latest changes failed. Will skip running tests for now." - fi - - # Always sleep, even if errors happens while trying to determine the latest - # commit. This will prevent us terminating in case of transient errors. - log "Will go to sleep for 5 minutes." - sleep 5m -done diff --git a/build_tools/dependencies.sh b/build_tools/dependencies.sh index c6e074b6460..22454c76fce 100644 --- a/build_tools/dependencies.sh +++ b/build_tools/dependencies.sh @@ -1,19 +1,19 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
-GCC_BASE=/mnt/gvfs/third-party2/gcc/112ec378fec7002ad3e09afde022e656049f7191/5.x/centos7-native/c447969 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/04999bdb3ce81a11073535dcb00b5e13dc1cbaf5/stable/centos7-native/c9f9104 -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/92b0c8e5c8eecc71eb042594ce1ab3413799b385/5.x/gcc-5-glibc-2.23/339d858 -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/3d8698d5973ba94f41620a80a67e4457fdf01e90/2.23/gcc-5-glibc-2.23/ca1d1c0 +GCC_BASE=/mnt/gvfs/third-party2/gcc/7331085db891a2ef4a88a48a751d834e8d68f4cb/5.x/centos7-native/c447969 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/1bd23f9917738974ad0ff305aa23eb5f93f18305/9.0.0/centos7-native/c9f9104 +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/6ace84e956873d53638c738b6f65f3f469cca74c/5.x/gcc-5-glibc-2.23/339d858 +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/192b0f42d63dcf6210d6ceae387b49af049e6e0c/2.23/gcc-5-glibc-2.23/ca1d1c0 SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/gcc-5-glibc-2.23/9bc6787 -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/gcc-5-glibc-2.23/9bc6787 +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/2d9f0b9a4274cc21f61272a9e89bdb859bce8f1f/1.2.8/gcc-5-glibc-2.23/9bc6787 BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/gcc-5-glibc-2.23/9bc6787 -LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/gcc-5-glibc-2.23/9bc6787 -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/af6628a46758f1a15484a1760cd7294164bc5ba1/1.3.5/gcc-5-glibc-2.23/03859b5 +LZ4_BASE=/mnt/gvfs/third-party2/lz4/0f607f8fc442ea7d6b876931b1898bb573d5e5da/1.9.1/gcc-5-glibc-2.23/9bc6787 +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/ca22bc441a4eb709e9e0b1f9fec9750fed7b31c5/1.4.x/gcc-5-glibc-2.23/03859b5 GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/gcc-5-glibc-2.23/9bc6787 -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b1a0e56c1e3e6929813a4331ade3a58ff083afbb/master/gcc-5-glibc-2.23/aa64d6b -NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/gcc-5-glibc-2.23/9bc6787 -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/gcc-5-glibc-2.23/b443de1 -TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/gcc-5-glibc-2.23/9bc6787 -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/55031de95a2b46c82948743419a603b3d6aefe28/2.29.1/centos7-native/da39a3e -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/gcc-5-glibc-2.23/9bc6787 +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/c26f08f47ac35fc31da2633b7da92d6b863246eb/master/gcc-5-glibc-2.23/0c8f76d +NUMA_BASE=/mnt/gvfs/third-party2/numa/3f3fb57a5ccc5fd21c66416c0b83e0aa76a05376/2.0.11/gcc-5-glibc-2.23/9bc6787 +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/40c73d874898b386a71847f1b99115d93822d11f/1.4/gcc-5-glibc-2.23/b443de1 +TBB_BASE=/mnt/gvfs/third-party2/tbb/4ce8e8dba77cdbd81b75d6f0c32fd7a1b76a11ec/2018_U5/gcc-5-glibc-2.23/9bc6787 +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/fb251ecd2f5ae16f8671f7014c246e52a748fe0b/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/2e3cb7d119b3cea5f1e738cc13a1ac69f49eb875/2.29.1/centos7-native/da39a3e 
+VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d42d152a15636529b0861ec493927200ebebca8e/3.15.0/gcc-5-glibc-2.23/9bc6787 LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.2.3/gcc-5-glibc-2.23/65372bd diff --git a/build_tools/dependencies_platform007.sh b/build_tools/dependencies_platform007.sh index 004bccb5365..1de8e785a80 100644 --- a/build_tools/dependencies_platform007.sh +++ b/build_tools/dependencies_platform007.sh @@ -1,19 +1,19 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -GCC_BASE=/mnt/gvfs/third-party2/gcc/6e8e715624fd15256a7970073387793dfcf79b46/7.x/centos7-native/b2ef2b6 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/ef37e1faa1c29782abfac1ae65a291b9b7966f6d/stable/centos7-native/c9f9104 -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/c67031f0f739ac61575a061518d6ef5038f99f90/7.x/platform007/5620abc -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/60d6f124a78798b73944f5ba87c2306ae3460153/2.26/platform007/f259413 +GCC_BASE=/mnt/gvfs/third-party2/gcc/7331085db891a2ef4a88a48a751d834e8d68f4cb/7.x/centos7-native/b2ef2b6 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/963d9aeda70cc4779885b1277484fe7544a04e3e/9.0.0/platform007/9e92d53/ +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/6ace84e956873d53638c738b6f65f3f469cca74c/7.x/platform007/5620abc +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/192b0f42d63dcf6210d6ceae387b49af049e6e0c/2.26/platform007/f259413 SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/platform007/ca4da3d -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/platform007/ca4da3d +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/2d9f0b9a4274cc21f61272a9e89bdb859bce8f1f/1.2.8/platform007/ca4da3d BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/platform007/ca4da3d -LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/platform007/ca4da3d -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/3ee276cbacfad3074e3f07bf826ac47f06970f4e/1.3.5/platform007/15a3614 +LZ4_BASE=/mnt/gvfs/third-party2/lz4/0f607f8fc442ea7d6b876931b1898bb573d5e5da/1.9.1/platform007/ca4da3d +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/ca22bc441a4eb709e9e0b1f9fec9750fed7b31c5/1.4.x/platform007/15a3614 GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/platform007/ca4da3d -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/9c910d36d6235cc40e8ff559358f1833452300ca/master/platform007/5b0f53e -NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/platform007/ca4da3d -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/platform007/6f3e0a9 -TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/platform007/ca4da3d -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/fb/platform007/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/92ff90349e2f43ea0a8246d8b1cf17b6869013e3/2.29.1/centos7-native/da39a3e -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/platform007/ca4da3d +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/c26f08f47ac35fc31da2633b7da92d6b863246eb/master/platform007/c26c002 +NUMA_BASE=/mnt/gvfs/third-party2/numa/3f3fb57a5ccc5fd21c66416c0b83e0aa76a05376/2.0.11/platform007/ca4da3d +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/40c73d874898b386a71847f1b99115d93822d11f/1.4/platform007/6f3e0a9 
+TBB_BASE=/mnt/gvfs/third-party2/tbb/4ce8e8dba77cdbd81b75d6f0c32fd7a1b76a11ec/2018_U5/platform007/ca4da3d +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/fb251ecd2f5ae16f8671f7014c246e52a748fe0b/fb/platform007/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/ab9f09bba370e7066cafd4eb59752db93f2e8312/2.29.1/platform007/15a3614 +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d42d152a15636529b0861ec493927200ebebca8e/3.15.0/platform007/ca4da3d LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.3.4/platform007/5007832 diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 4415f87da38..4834be5f4c1 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -86,9 +86,10 @@ else fi CFLAGS+=" -DTBB" -# use Intel SSE support for checksum calculations -export USE_SSE=1 -export PORTABLE=1 +test "$USE_SSE" || USE_SSE=1 +export USE_SSE +test "$PORTABLE" || PORTABLE=1 +export PORTABLE BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" @@ -159,4 +160,6 @@ else LUA_LIB=" $LUA_PATH/lib/liblua_pic.a" fi +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/build_tools/fbcode_config4.8.1.sh b/build_tools/fbcode_config4.8.1.sh index 80fbdf431b9..5f0813a041e 100644 --- a/build_tools/fbcode_config4.8.1.sh +++ b/build_tools/fbcode_config4.8.1.sh @@ -53,9 +53,10 @@ LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a" TBB_INCLUDE=" -isystem $TBB_BASE/include/" TBB_LIBS="$TBB_BASE/lib/libtbb.a" -# use Intel SSE support for checksum calculations -export USE_SSE=1 -export PORTABLE=1 +test "$USE_SSE" || USE_SSE=1 +export USE_SSE +test "$PORTABLE" || PORTABLE=1 +export PORTABLE BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" diff --git a/build_tools/fbcode_config_platform007.sh b/build_tools/fbcode_config_platform007.sh index 1a1e4208139..51edf134fcb 100644 --- a/build_tools/fbcode_config_platform007.sh +++ b/build_tools/fbcode_config_platform007.sh @@ -86,9 +86,10 @@ else fi CFLAGS+=" -DTBB" -# use Intel SSE support for checksum calculations -export USE_SSE=1 -export PORTABLE=1 +test "$USE_SSE" || USE_SSE=1 +export USE_SSE +test "$PORTABLE" || PORTABLE=1 +export PORTABLE BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" @@ -155,4 +156,6 @@ VALGRIND_VER="$VALGRIND_BASE/bin/" LUA_PATH= LUA_LIB= +USE_FOLLY_DISTRIBUTED_MUTEX=1 + export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index f8fbb64fbae..4c0f0e5d5d2 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -54,15 +54,25 @@ fi set -e uncommitted_code=`git diff HEAD` -LAST_MASTER=`git merge-base master HEAD` # If there's no uncommitted changes, we assume user are doing post-commit -# format check, in which case we'll check the modified lines since last commit -# from master. Otherwise, we'll check format of the uncommitted code only. +# format check, in which case we'll try to check the modified lines vs. the +# facebook/rocksdb.git master branch. Otherwise, we'll check format of the +# uncommitted code only. if [ -z "$uncommitted_code" ] then - # Check the format of last commit - diffs=$(git diff -U0 $LAST_MASTER^ | $CLANG_FORMAT_DIFF -p 1) + # Attempt to get name of facebook/rocksdb.git remote. 
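The fbcode_config*.sh scripts now default USE_SSE and PORTABLE to 1 only when the caller has not already set them, instead of forcing the values unconditionally. Roughly the same "default only when unset" idea, expressed with environment variables (the variable names come from the scripts, the helper is illustrative):

```python
# Roughly mirrors:  test "$USE_SSE" || USE_SSE=1; export USE_SSE
# os.environ.setdefault leaves any caller-provided value untouched.
import os

os.environ.setdefault("USE_SSE", "1")
os.environ.setdefault("PORTABLE", "1")
print(os.environ["USE_SSE"], os.environ["PORTABLE"])
```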
+ [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)" + # Fall back on 'origin' if that fails + [ "$FORMAT_REMOTE" ] || FORMAT_REMOTE=origin + # Use master branch from that remote + [ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/master" + # Get the common ancestor with that remote branch. Everything after that + # common ancestor would be considered the contents of a pull request, so + # should be relevant for formatting fixes. + FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)" + # Get the differences + diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) else # Check the format of uncommitted lines, diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) @@ -76,12 +86,12 @@ fi # Highlight the insertion/deletion from the clang-format-diff.py's output COLOR_END="\033[0m" -COLOR_RED="\033[0;31m" -COLOR_GREEN="\033[0;32m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" echo -e "Detect lines that doesn't follow the format rules:\r" # Add the color to the diff. lines added will be green; lines removed will be red. -echo "$diffs" | +echo "$diffs" | sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" @@ -104,7 +114,7 @@ fi # Do in-place format adjustment. if [ -z "$uncommitted_code" ] then - git diff -U0 $LAST_MASTER^ | $CLANG_FORMAT_DIFF -i -p 1 + git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1 else git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 fi diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index 2447a19ae44..f4ea9ca346e 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -63,6 +63,21 @@ CLEANUP_ENV=" 'user':'root' }" +UPLOAD_DB_DIR=" +{ + 'name':'Upload database directory', + 'shell':'tar -cvzf rocksdb_db.tar.gz /dev/shm/rocksdb/', + 'user':'root', + 'cleanup':true, + 'provide_artifacts': [ + { + 'name':'rocksdb_db_dir', + 'paths': ['rocksdb_db.tar.gz'], + 'bundle': false, + }, + ], +}" + # We will eventually set the RATIO to 1, but we want do this # in steps. 
RATIO=$(nproc) will make it work as J=1 if [ -z $RATIO ]; then @@ -109,13 +124,6 @@ else TASK_CREATION_TOOL="false" fi -ARTIFACTS=" 'artifacts': [ - { - 'name':'database', - 'paths':[ '/dev/shm/rocksdb' ], - } -]" - # # A mechanism to disable tests temporarily # @@ -140,6 +148,7 @@ UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -160,6 +169,7 @@ UNIT_TEST_NON_SHM_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -182,6 +192,7 @@ RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -202,6 +213,7 @@ UNIT_TEST_COMMANDS_481="[ { 'name':'Rocksdb Unit Test on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -222,6 +234,7 @@ RELEASE_BUILD_COMMANDS_481="[ { 'name':'Rocksdb Release on GCC 4.8.1', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -242,6 +255,7 @@ CLANG_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -262,6 +276,7 @@ CLANG_RELEASE_BUILD_COMMANDS="[ { 'name':'Rocksdb CLANG Release Build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -282,6 +297,7 @@ CLANG_ANALYZE_COMMANDS="[ { 'name':'Rocksdb analyze', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -302,6 +318,7 @@ CODE_COV_COMMANDS="[ { 'name':'Rocksdb Unit Test Code Coverage', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -322,6 +339,7 @@ UNITY_COMMANDS="[ { 'name':'Rocksdb Unity', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -342,6 +360,7 @@ LITE_BUILD_COMMANDS="[ { 'name':'Rocksdb Lite build', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -361,6 +380,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ { 'name':'Rocksdb Lite Binary Size', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -376,8 +396,9 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ # STRESS_CRASH_TEST_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test', + 'name':'Rocksdb Stress and Crash Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -395,7 +416,6 @@ STRESS_CRASH_TEST_COMMANDS="[ $PARSER } ], - $ARTIFACTS, $REPORT } ]" @@ -405,8 +425,9 @@ STRESS_CRASH_TEST_COMMANDS="[ # STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb Stress/Crash Test (atomic flush)', + 'name':'Rocksdb Stress and Crash Test with atomic flush', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -422,9 +443,9 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER - } + }, + $UPLOAD_DB_DIR, ], - $ARTIFACTS, $REPORT } ]" @@ -436,6 +457,7 @@ WRITE_STRESS_COMMANDS="[ { 'name':'Rocksdb Write Stress Test', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -458,6 +480,7 @@ ASAN_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -478,6 +501,7 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ 
$CLEANUP_ENV, @@ -498,8 +522,9 @@ ASAN_CRASH_TEST_COMMANDS="[ # ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under ASAN', + 'name':'Rocksdb crash test with atomic flush under ASAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -510,6 +535,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -522,11 +548,12 @@ UBSAN_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { 'name':'Test RocksDB debug under UBSAN', - 'shell':'set -o pipefail && $SHM $UBSAN $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', + 'shell':'set -o pipefail && $SHM $UBSAN $CLANG $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -542,13 +569,14 @@ UBSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb crash test under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -562,18 +590,20 @@ UBSAN_CRASH_TEST_COMMANDS="[ # UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { - 'name':'Rocksdb crash test (atomic flush) under UBSAN', + 'name':'Rocksdb crash test with atomic flush under UBSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug ubsan_crash_test_with_atomic_flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -586,6 +616,7 @@ VALGRIND_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under valgrind', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -608,6 +639,7 @@ TSAN_UNIT_TEST_COMMANDS="[ { 'name':'Rocksdb Unit Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -630,6 +662,7 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Rocksdb Crash Test under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -652,6 +685,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Rocksdb Crash Test with atomic flush under TSAN', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'timeout': 86400, 'steps': [ $CLEANUP_ENV, @@ -662,6 +696,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ 'user':'root', $PARSER }, + $UPLOAD_DB_DIR, ], $REPORT } @@ -684,6 +719,7 @@ FORMAT_COMPATIBLE_COMMANDS="[ { 'name':'Rocksdb Format Compatible tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -717,6 +753,7 @@ NO_COMPRESSION_COMMANDS="[ { 'name':'Rocksdb No Compression tests', 'oncall':'$ONCALL', + 'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { @@ -794,6 +831,7 @@ JAVA_BUILD_TEST_COMMANDS="[ { 'name':'Rocksdb Java Build', 'oncall':'$ONCALL', + 
'executeLocal': 'true', 'steps': [ $CLEANUP_ENV, { diff --git a/build_tools/update_dependencies.sh b/build_tools/update_dependencies.sh index 9af8c60d1d8..dbc95a6e545 100755 --- a/build_tools/update_dependencies.sh +++ b/build_tools/update_dependencies.sh @@ -6,6 +6,12 @@ BASEDIR=$(dirname $0) OUTPUT="" +function log_header() +{ + echo "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved." >> "$OUTPUT" +} + + function log_variable() { echo "$1=${!1}" >> "$OUTPUT" @@ -69,6 +75,7 @@ echo "Writing dependencies to $OUTPUT" GCC_BASE=`readlink -f $TP2_LATEST/gcc/7.x/centos7-native/*/` CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` +log_header log_variable GCC_BASE log_variable CLANG_BASE @@ -108,6 +115,7 @@ echo "Writing dependencies to $OUTPUT" GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos7-native/*/` CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` +log_header log_variable GCC_BASE log_variable CLANG_BASE @@ -147,6 +155,7 @@ echo "Writing 4.8.1 dependencies to $OUTPUT" GCC_BASE=`readlink -f $TP2_LATEST/gcc/4.8.1/centos6-native/*/` CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos6-native/*/` +log_header log_variable GCC_BASE log_variable CLANG_BASE diff --git a/cache/cache_bench.cc b/cache/cache_bench.cc index 098813d9d74..288662ad9df 100644 --- a/cache/cache_bench.cc +++ b/cache/cache_bench.cc @@ -3,9 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #ifndef GFLAGS #include int main() { @@ -14,9 +11,9 @@ int main() { } #else -#include -#include #include +#include +#include #include "port/port.h" #include "rocksdb/cache.h" diff --git a/cache/cache_test.cc b/cache/cache_test.cc index f9f77234cdb..a0f75bfdc4d 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -16,9 +16,9 @@ #include #include "cache/clock_cache.h" #include "cache/lru_cache.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/testharness.h" namespace rocksdb { @@ -86,14 +86,22 @@ class CacheTest : public testing::TestWithParam { return nullptr; } - std::shared_ptr NewCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) { + std::shared_ptr NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) { auto type = GetParam(); if (type == kLRU) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + LRUCacheOptions co; + co.capacity = capacity; + co.num_shard_bits = num_shard_bits; + co.strict_capacity_limit = strict_capacity_limit; + co.high_pri_pool_ratio = 0; + co.metadata_charge_policy = charge_policy; + return NewLRUCache(co); } if (type == kClock) { - return NewClockCache(capacity, num_shard_bits, strict_capacity_limit); + return NewClockCache(capacity, num_shard_bits, strict_capacity_limit, + charge_policy); } return nullptr; } @@ -143,10 +151,15 @@ class CacheTest : public testing::TestWithParam { }; CacheTest* CacheTest::current_; +class LRUCacheTest : public CacheTest {}; + TEST_P(CacheTest, UsageTest) { // cache is std::shared_ptr and will be automatically cleaned up. 
const uint64_t kCapacity = 100000; - auto cache = NewCache(kCapacity, 8, false); + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); size_t usage = 0; char value[10] = "abcdef"; @@ -155,31 +168,45 @@ TEST_P(CacheTest, UsageTest) { std::string key(i, 'a'); auto kv_size = key.size() + 5; cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); + ASSERT_LT(usage, precise_cache->GetUsage()); } + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); + // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { auto key = ToString(i); cache->Insert(key, reinterpret_cast(value), key.size() + 5, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } // the usage should be close to the capacity ASSERT_GT(kCapacity, cache->GetUsage()); + ASSERT_GT(kCapacity, precise_cache->GetUsage()); ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); + ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); } TEST_P(CacheTest, PinnedUsageTest) { // cache is std::shared_ptr and will be automatically cleaned up. - const uint64_t kCapacity = 100000; - auto cache = NewCache(kCapacity, 8, false); + const uint64_t kCapacity = 200000; + auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); size_t pinned_usage = 0; char value[10] = "abcdef"; std::forward_list unreleased_handles; + std::forward_list unreleased_handles_in_precise_cache; // Add entries. Unpin some of them after insertion. Then, pin some of them // again. Check GetPinnedUsage(). 
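The updated UsageTest relies on the fact that, under kFullChargeCacheMetadata, each entry's reported usage is its caller-supplied charge plus per-entry metadata, so the precise cache's GetUsage() exceeds the plain sum of charges. A toy Python model of that accounting; the 64-byte overhead constant is an arbitrary stand-in, not RocksDB's real handle size:

```python
# Toy model of usage accounting under the two metadata charge policies.
# META_OVERHEAD is an illustrative constant, not the real per-handle size.
DONT_CHARGE, FULL_CHARGE = "dont", "full"
META_OVERHEAD = 64

def usage(entries, policy):
    total = 0
    for key, charge in entries:
        total += charge
        if policy == FULL_CHARGE:
            total += META_OVERHEAD + len(key)  # handle struct + key allocation
    return total

entries = [("a" * i, i + 5) for i in range(1, 10)]
assert usage(entries, FULL_CHARGE) > usage(entries, DONT_CHARGE)
```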
@@ -187,40 +214,72 @@ TEST_P(CacheTest, PinnedUsageTest) { std::string key(i, 'a'); auto kv_size = key.size() + 5; Cache::Handle* handle; + Cache::Handle* handle_in_precise_cache; cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter, &handle); + assert(handle); + precise_cache->Insert(key, reinterpret_cast(value), kv_size, + dumbDeleter, &handle_in_precise_cache); + assert(handle_in_precise_cache); pinned_usage += kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); if (i % 2 == 0) { cache->Release(handle); + precise_cache->Release(handle_in_precise_cache); pinned_usage -= kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); } else { unreleased_handles.push_front(handle); + unreleased_handles_in_precise_cache.push_front(handle_in_precise_cache); } if (i % 3 == 0) { unreleased_handles.push_front(cache->Lookup(key)); + auto x = precise_cache->Lookup(key); + assert(x); + unreleased_handles_in_precise_cache.push_front(x); // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned // usage increased if (i % 2 == 0) { pinned_usage += kv_size; } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_LT(pinned_usage, precise_cache->GetPinnedUsage()); } } + auto precise_cache_pinned_usage = precise_cache->GetPinnedUsage(); + ASSERT_LT(pinned_usage, precise_cache_pinned_usage); // check that overloading the cache does not change the pinned usage for (uint64_t i = 1; i < 2 * kCapacity; ++i) { auto key = ToString(i); cache->Insert(key, reinterpret_cast(value), key.size() + 5, dumbDeleter); + precise_cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); + + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); + ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); // release handles for pinned entries to prevent memory leaks for (auto handle : unreleased_handles) { cache->Release(handle); } + for (auto handle : unreleased_handles_in_precise_cache) { + precise_cache->Release(handle); + } + ASSERT_EQ(0, cache->GetPinnedUsage()); + ASSERT_EQ(0, precise_cache->GetPinnedUsage()); + cache->EraseUnRefEntries(); + precise_cache->EraseUnRefEntries(); + ASSERT_EQ(0, cache->GetUsage()); + ASSERT_EQ(0, precise_cache->GetUsage()); } TEST_P(CacheTest, HitAndMiss) { @@ -306,7 +365,7 @@ TEST_P(CacheTest, EvictionPolicy) { Insert(200, 201); // Frequently used entry must be kept around - for (int i = 0; i < kCacheSize + 200; i++) { + for (int i = 0; i < kCacheSize * 2; i++) { Insert(1000+i, 2000+i); ASSERT_EQ(101, Lookup(100)); } @@ -359,7 +418,7 @@ TEST_P(CacheTest, EvictionPolicyRef) { Insert(303, 104); // Insert entries much more than Cache capacity - for (int i = 0; i < kCacheSize + 200; i++) { + for (int i = 0; i < kCacheSize * 2; i++) { Insert(1000 + i, 2000 + i); } @@ -550,10 +609,10 @@ TEST_P(CacheTest, SetCapacity) { } } -TEST_P(CacheTest, SetStrictCapacityLimit) { +TEST_P(LRUCacheTest, SetStrictCapacityLimit) { // test1: set the flag to false. Insert more keys than capacity. See if they // all go through. 
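PinnedUsageTest exercises the rule that GetPinnedUsage() counts only entries with outstanding references: an entry's charge is added when the first handle is taken and removed when the last handle is released, and overloading the cache leaves pinned usage unchanged. A small reference-count model of that rule (plain Python, not the RocksDB implementation):

```python
# Minimal reference-count model of pinned-usage accounting: an entry's
# charge counts toward pinned usage only while its refcount is non-zero.
class PinnedUsageModel:
    def __init__(self):
        self.refs = {}      # key -> refcount
        self.charges = {}   # key -> charge
        self.pinned_usage = 0

    def acquire(self, key, charge):
        if self.refs.get(key, 0) == 0:
            self.pinned_usage += charge
        self.refs[key] = self.refs.get(key, 0) + 1
        self.charges[key] = charge

    def release(self, key):
        self.refs[key] -= 1
        if self.refs[key] == 0:
            self.pinned_usage -= self.charges[key]

m = PinnedUsageModel()
m.acquire("k1", 10)
m.acquire("k1", 10)           # second handle on the same entry
m.release("k1")
assert m.pinned_usage == 10   # still pinned by the remaining handle
m.release("k1")
assert m.pinned_usage == 0
```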
- std::shared_ptr cache = NewLRUCache(5, 0, false); + std::shared_ptr cache = NewCache(5, 0, false); std::vector handles(10); Status s; for (size_t i = 0; i < 10; i++) { @@ -562,6 +621,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { ASSERT_OK(s); ASSERT_NE(nullptr, handles[i]); } + ASSERT_EQ(10, cache->GetUsage()); // test2: set the flag to true. Insert and check if it fails. std::string extra_key = "extra"; @@ -571,13 +631,14 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle); ASSERT_TRUE(s.IsIncomplete()); ASSERT_EQ(nullptr, handle); + ASSERT_EQ(10, cache->GetUsage()); for (size_t i = 0; i < 10; i++) { cache->Release(handles[i]); } // test3: init with flag being true. - std::shared_ptr cache2 = NewLRUCache(5, 0, true); + std::shared_ptr cache2 = NewCache(5, 0, true); for (size_t i = 0; i < 5; i++) { std::string key = ToString(i + 1); s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); @@ -591,7 +652,7 @@ TEST_P(CacheTest, SetStrictCapacityLimit) { s = cache2->Insert(extra_key, extra_value, 1, &deleter); // AS if the key have been inserted into cache but get evicted immediately. ASSERT_OK(s); - ASSERT_EQ(5, cache->GetUsage()); + ASSERT_EQ(5, cache2->GetUsage()); ASSERT_EQ(nullptr, cache2->Lookup(extra_key)); for (size_t i = 0; i < 5; i++) { @@ -686,14 +747,23 @@ TEST_P(CacheTest, DefaultShardBits) { ASSERT_EQ(6, sc->GetNumShardBits()); } +TEST_P(CacheTest, GetCharge) { + Insert(1, 2); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); + ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1, cache_->GetCharge(h1)); + cache_->Release(h1); +} + #ifdef SUPPORT_CLOCK_CACHE -std::shared_ptr (*new_clock_cache_func)(size_t, int, - bool) = NewClockCache; +std::shared_ptr (*new_clock_cache_func)( + size_t, int, bool, CacheMetadataChargePolicy) = NewClockCache; INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU, kClock)); #else INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU)); #endif // SUPPORT_CLOCK_CACHE +INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU)); } // namespace rocksdb diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 89173834e23..9165ad5dd10 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -13,8 +13,9 @@ namespace rocksdb { -std::shared_ptr NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/, - bool /*strict_capacity_limit*/) { +std::shared_ptr NewClockCache( + size_t /*capacity*/, int /*num_shard_bits*/, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy /*metadata_charge_policy*/) { // Clock cache not supported. 
return nullptr; } @@ -35,6 +36,7 @@ std::shared_ptr NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/ #include "tbb/concurrent_hash_map.h" #include "cache/sharded_cache.h" +#include "port/malloc.h" #include "port/port.h" #include "util/autovector.h" #include "util/mutexlock.h" @@ -202,6 +204,27 @@ struct CacheHandle { deleter = a.deleter; return *this; } + + inline static size_t CalcTotalCharge( + Slice key, size_t charge, + CacheMetadataChargePolicy metadata_charge_policy) { + size_t meta_charge = 0; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + meta_charge += sizeof(CacheHandle); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + meta_charge += + malloc_usable_size(static_cast(const_cast(key.data()))); +#else + meta_charge += key.size(); +#endif + } + return charge + meta_charge; + } + + inline size_t CalcTotalCharge( + CacheMetadataChargePolicy metadata_charge_policy) { + return CalcTotalCharge(key, charge, metadata_charge_policy); + } }; // Key of hash map. We store hash value with the key for convenience. @@ -404,11 +427,12 @@ void ClockCacheShard::RecycleHandle(CacheHandle* handle, assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); context->to_delete_key.push_back(handle->key.data()); context->to_delete_value.emplace_back(*handle); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); handle->key.clear(); handle->value = nullptr; handle->deleter = nullptr; recycle_.push_back(handle); - usage_.fetch_sub(handle->charge, std::memory_order_relaxed); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); } void ClockCacheShard::Cleanup(const CleanupContext& context) { @@ -434,7 +458,8 @@ bool ClockCacheShard::Ref(Cache::Handle* h) { std::memory_order_relaxed)) { if (CountRefs(flags) == 0) { // No reference count before the operation. - pinned_usage_.fetch_add(handle->charge, std::memory_order_relaxed); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } return true; } @@ -454,7 +479,8 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, assert(CountRefs(flags) > 0); if (CountRefs(flags) == 1) { // this is the last reference. - pinned_usage_.fetch_sub(handle->charge, std::memory_order_relaxed); + size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); // Cleanup if it is the last reference. 
if (!InCache(flags)) { MutexLock l(&mutex_); @@ -539,8 +565,10 @@ CacheHandle* ClockCacheShard::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), bool hold_reference, CleanupContext* context) { + size_t total_charge = + CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_); MutexLock l(&mutex_); - bool success = EvictFromCache(charge, context); + bool success = EvictFromCache(total_charge, context); bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); if (!success && (strict || !hold_reference)) { context->to_delete_key.push_back(key.data()); @@ -575,9 +603,9 @@ CacheHandle* ClockCacheShard::Insert( } table_.insert(HashTable::value_type(CacheKey(key, hash), handle)); if (hold_reference) { - pinned_usage_.fetch_add(charge, std::memory_order_relaxed); + pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } - usage_.fetch_add(charge, std::memory_order_relaxed); + usage_.fetch_add(total_charge, std::memory_order_relaxed); return handle; } @@ -674,10 +702,14 @@ void ClockCacheShard::EraseUnRefEntries() { class ClockCache final : public ShardedCache { public: - ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit) + ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { int num_shards = 1 << num_shard_bits; shards_ = new ClockCacheShard[num_shards]; + for (int i = 0; i < num_shards; i++) { + shards_[i].set_metadata_charge_policy(metadata_charge_policy); + } SetCapacity(capacity); SetStrictCapacityLimit(strict_capacity_limit); } @@ -714,13 +746,14 @@ class ClockCache final : public ShardedCache { } // end anonymous namespace -std::shared_ptr NewClockCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) { +std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) { if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared(capacity, num_shard_bits, - strict_capacity_limit); + return std::make_shared( + capacity, num_shard_bits, strict_capacity_limit, metadata_charge_policy); } } // namespace rocksdb diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index fdcbb4e86cb..0e49167ed5b 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
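With kFullChargeCacheMetadata, the clock cache hunks above make every entry pay for its own bookkeeping: the CacheHandle struct plus the heap block that backs the key, probed with malloc_usable_size() when ROCKSDB_MALLOC_USABLE_SIZE is defined and approximated by the key length otherwise. A standalone sketch of that arithmetic using the fallback approximation (HandleLike is an illustrative stand-in, not the real CacheHandle):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Illustrative stand-in for a cache handle; layout is not the real struct.
struct HandleLike {
  void* value;
  void (*deleter)(const std::string&, void*);
  uint32_t hash;
  size_t charge;
};

// Mirrors the fallback branch of CalcTotalCharge(): when the allocator's
// usable-size probe is unavailable, the key is charged at its logical size.
size_t TotalCharge(const std::string& key, size_t charge,
                   bool full_charge_metadata) {
  size_t meta = 0;
  if (full_charge_metadata) {
    meta += sizeof(HandleLike);  // the handle object itself
    meta += key.size();          // the key bytes owned by the cache
  }
  return charge + meta;
}

int main() {
  // A 16-byte key with a 100-byte user charge: metadata adds the handle plus
  // the key, so the entry consumes noticeably more than 100 bytes of budget.
  std::cout << TotalCharge(std::string(16, 'k'), 100, false) << "\n";  // 100
  std::cout << TotalCharge(std::string(16, 'k'), 100, true) << "\n";   // 100 + sizeof(HandleLike) + 16
  return 0;
}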
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/lru_cache.h" #include @@ -28,7 +24,7 @@ LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { LRUHandleTable::~LRUHandleTable() { ApplyToAllCacheEntries([](LRUHandle* h) { - if (h->refs == 1) { + if (!h->HasRefs()) { h->Free(); } }); @@ -101,7 +97,8 @@ void LRUHandleTable::Resize() { LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, - bool use_adaptive_mutex) + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) : capacity_(0), high_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), @@ -110,6 +107,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, usage_(0), lru_usage_(0), mutex_(use_adaptive_mutex) { + set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; @@ -117,30 +115,20 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, SetCapacity(capacity); } -LRUCacheShard::~LRUCacheShard() {} - -bool LRUCacheShard::Unref(LRUHandle* e) { - assert(e->refs > 0); - e->refs--; - return e->refs == 0; -} - -// Call deleter and free - void LRUCacheShard::EraseUnRefEntries() { autovector last_reference_list; { MutexLock l(&mutex_); while (lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == - 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); - usage_ -= old->charge; + size_t total_charge = old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; last_reference_list.push_back(old); } } @@ -152,22 +140,27 @@ void LRUCacheShard::EraseUnRefEntries() { void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) { + const auto applyCallback = [&]() { + table_.ApplyToAllCacheEntries( + [callback](LRUHandle* h) { callback(h->value, h->charge); }); + }; + if (thread_safe) { - mutex_.Lock(); - } - table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - if (thread_safe) { - mutex_.Unlock(); + MutexLock l(&mutex_); + applyCallback(); + } else { + applyCallback(); } } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { + MutexLock l(&mutex_); *lru = &lru_; *lru_low_pri = lru_low_pri_; } size_t LRUCacheShard::TEST_GetLRUSize() { + MutexLock l(&mutex_); LRUHandle* lru_handle = lru_.next; size_t lru_size = 0; while (lru_handle != &lru_) { @@ -191,16 +184,19 @@ void LRUCacheShard::LRU_Remove(LRUHandle* e) { e->next->prev = e->prev; e->prev->next = e->next; e->prev = e->next = nullptr; - lru_usage_ -= e->charge; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(lru_usage_ >= total_charge); + lru_usage_ -= total_charge; if (e->InHighPriPool()) { - assert(high_pri_pool_usage_ >= e->charge); - high_pri_pool_usage_ -= e->charge; + assert(high_pri_pool_usage_ >= total_charge); + high_pri_pool_usage_ -= total_charge; } } void LRUCacheShard::LRU_Insert(LRUHandle* e) { assert(e->next == nullptr); assert(e->prev == nullptr); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) { // Inset "e" to head of LRU list. 
e->next = &lru_; @@ -208,7 +204,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) { e->prev->next = e; e->next->prev = e; e->SetInHighPriPool(true); - high_pri_pool_usage_ += e->charge; + high_pri_pool_usage_ += total_charge; MaintainPoolSize(); } else { // Insert "e" to the head of low-pri pool. Note that when @@ -220,7 +216,7 @@ void LRUCacheShard::LRU_Insert(LRUHandle* e) { e->SetInHighPriPool(false); lru_low_pri_ = e; } - lru_usage_ += e->charge; + lru_usage_ += total_charge; } void LRUCacheShard::MaintainPoolSize() { @@ -229,21 +225,25 @@ void LRUCacheShard::MaintainPoolSize() { lru_low_pri_ = lru_low_pri_->next; assert(lru_low_pri_ != &lru_); lru_low_pri_->SetInHighPriPool(false); - high_pri_pool_usage_ -= lru_low_pri_->charge; + size_t total_charge = + lru_low_pri_->CalcTotalCharge(metadata_charge_policy_); + assert(high_pri_pool_usage_ >= total_charge); + high_pri_pool_usage_ -= total_charge; } } void LRUCacheShard::EvictFromLRU(size_t charge, autovector* deleted) { - while (usage_ + charge > capacity_ && lru_.next != &lru_) { + while ((usage_ + charge) > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; - assert(old->InCache()); - assert(old->refs == 1); // LRU list contains elements which may be evicted + // LRU list contains only elements which can be evicted + assert(old->InCache() && !old->HasRefs()); LRU_Remove(old); table_.Remove(old->key(), old->hash); old->SetInCache(false); - Unref(old); - usage_ -= old->charge; + size_t old_total_charge = old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; deleted->push_back(old); } } @@ -256,8 +256,8 @@ void LRUCacheShard::SetCapacity(size_t capacity) { high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_; EvictFromLRU(0, &last_reference_list); } - // we free the entries here outside of mutex for - // performance reasons + + // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -273,22 +273,22 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { LRUHandle* e = table_.Lookup(key, hash); if (e != nullptr) { assert(e->InCache()); - if (e->refs == 1) { + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); } - e->refs++; + e->Ref(); e->SetHit(); } return reinterpret_cast(e); } bool LRUCacheShard::Ref(Cache::Handle* h) { - LRUHandle* handle = reinterpret_cast(h); + LRUHandle* e = reinterpret_cast(h); MutexLock l(&mutex_); - if (handle->InCache() && handle->refs == 1) { - LRU_Remove(handle); - } - handle->refs++; + // To create another reference - entry must be already externally referenced + assert(e->HasRefs()); + e->Ref(); return true; } @@ -307,30 +307,29 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) { bool last_reference = false; { MutexLock l(&mutex_); - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (e->refs == 1 && e->InCache()) { + last_reference = e->Unref(); + if (last_reference && e->InCache()) { // The item is still in cache, and nobody else holds a reference to it if (usage_ > capacity_ || force_erase) { - // the cache is full // The LRU list must be empty since the cache is full - assert(!(usage_ > capacity_) || lru_.next == &lru_); - // take this opportunity and remove the item + assert(lru_.next == &lru_ || force_erase); + // Take this opportunity and remove the item table_.Remove(e->key(), e->hash); e->SetInCache(false); - Unref(e); - 
usage_ -= e->charge; - last_reference = true; } else { - // put the item on the list to be potentially freed + // Put the item back on the LRU list, and don't free it LRU_Insert(e); + last_reference = false; } } + if (last_reference) { + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; + } } - // free outside of mutex + // Free the entry here outside of mutex for performance reasons if (last_reference) { e->Free(); } @@ -346,7 +345,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, // It shouldn't happen very often though. LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s; + Status s = Status::OK(); autovector last_reference_list; e->value = value; @@ -355,26 +354,26 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, e->key_length = key.size(); e->flags = 0; e->hash = hash; - e->refs = (handle == nullptr - ? 1 - : 2); // One from LRUCache, one for the returned handle + e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); { MutexLock l(&mutex_); // Free the space following strict LRU policy until enough space // is freed or the lru list is empty - EvictFromLRU(charge, &last_reference_list); + EvictFromLRU(total_charge, &last_reference_list); - if (usage_ - lru_usage_ + charge > capacity_ && + if ((usage_ + total_charge) > capacity_ && (strict_capacity_limit_ || handle == nullptr)) { if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry inserted // into cache and get evicted immediately. + e->SetInCache(false); last_reference_list.push_back(e); } else { delete[] reinterpret_cast(e); @@ -382,32 +381,33 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, s = Status::Incomplete("Insert failed due to LRU cache being full."); } } else { - // insert into the cache - // note that the cache might get larger than its capacity if not enough - // space was freed + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. 
LRUHandle* old = table_.Insert(e); - usage_ += e->charge; + usage_ += total_charge; if (old != nullptr) { + assert(old->InCache()); old->SetInCache(false); - if (Unref(old)) { - usage_ -= old->charge; - // old is on LRU because it's in cache and its reference count - // was just 1 (Unref returned 0) + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 LRU_Remove(old); + size_t old_total_charge = + old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; last_reference_list.push_back(old); } } if (handle == nullptr) { LRU_Insert(e); } else { + e->Ref(); *handle = reinterpret_cast(e); } - s = Status::OK(); } } - // we free the entries here outside of mutex for - // performance reasons + // Free the entries here outside of mutex for performance reasons for (auto entry : last_reference_list) { entry->Free(); } @@ -422,18 +422,20 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); e = table_.Remove(key, hash); if (e != nullptr) { - last_reference = Unref(e); - if (last_reference) { - usage_ -= e->charge; - } - if (last_reference && e->InCache()) { + assert(e->InCache()); + e->SetInCache(false); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references LRU_Remove(e); + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= total_charge); + usage_ -= total_charge; + last_reference = true; } - e->SetInCache(false); } } - // mutex not held here + // Free the entry here outside of mutex for performance reasons // last_reference will only be true if e != nullptr if (last_reference) { e->Free(); @@ -465,7 +467,8 @@ std::string LRUCacheShard::GetPrintableOptions() const { LRUCache::LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr allocator, - bool use_adaptive_mutex) + bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, std::move(allocator)) { num_shards_ = 1 << num_shard_bits; @@ -475,7 +478,7 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, for (int i = 0; i < num_shards_; i++) { new (&shards_[i]) LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, - use_adaptive_mutex); + use_adaptive_mutex, metadata_charge_policy); } } @@ -544,15 +547,15 @@ std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, - cache_opts.use_adaptive_mutex); + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy); } std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, - std::shared_ptr memory_allocator, - bool use_adaptive_mutex) { + std::shared_ptr memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -563,10 +566,9 @@ std::shared_ptr NewLRUCache( if (num_shard_bits < 0) { num_shard_bits = GetDefaultCacheShardBits(capacity); } - return std::make_shared(capacity, num_shard_bits, - strict_capacity_limit, high_pri_pool_ratio, - std::move(memory_allocator), - use_adaptive_mutex); + return std::make_shared( + capacity, 
num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, + std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy); } } // namespace rocksdb diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 0d9a317486e..6313c69dba9 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -12,36 +12,40 @@ #include "cache/sharded_cache.h" +#include "port/malloc.h" #include "port/port.h" #include "util/autovector.h" namespace rocksdb { -// LRU cache implementation +// LRU cache implementation. This class is not thread-safe. // An entry is a variable length heap-allocated structure. // Entries are referenced by cache and/or by any external entity. -// The cache keeps all its entries in table. Some elements +// The cache keeps all its entries in a hash table. Some elements // are also stored on LRU list. // // LRUHandle can be in these states: // 1. Referenced externally AND in hash table. -// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true) -// 2. Not referenced externally and in hash table. In that case the entry is -// in the LRU and can be freed. (refs == 1 && in_cache == true) -// 3. Referenced externally and not in hash table. In that case the entry is -// in not on LRU and not in table. (refs >= 1 && in_cache == false) +// In that case the entry is *not* in the LRU list +// (refs >= 1 && in_cache == true) +// 2. Not referenced externally AND in hash table. +// In that case the entry is in the LRU list and can be freed. +// (refs == 0 && in_cache == true) +// 3. Referenced externally AND not in hash table. +// In that case the entry is not in the LRU list and not in hash table. +// The entry can be freed when refs becomes 0. +// (refs >= 1 && in_cache == false) // // All newly created LRUHandles are in state 1. If you call -// LRUCacheShard::Release -// on entry in state 1, it will go into state 2. To move from state 1 to -// state 3, either call LRUCacheShard::Erase or LRUCacheShard::Insert with the -// same key. +// LRUCacheShard::Release on entry in state 1, it will go into state 2. +// To move from state 1 to state 3, either call LRUCacheShard::Erase or +// LRUCacheShard::Insert with the same key (but possibly different value). // To move from state 2 to state 1, use LRUCacheShard::Lookup. // Before destruction, make sure that no handles are in state 1. This means // that any successful LRUCacheShard::Lookup/LRUCacheShard::Insert have a -// matching -// RUCache::Release (to move into state 2) or LRUCacheShard::Erase (for state 3) +// matching LRUCache::Release (to move into state 2) or LRUCacheShard::Erase +// (to move into state 3). struct LRUHandle { void* value; @@ -51,37 +55,42 @@ struct LRUHandle { LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? size_t key_length; - uint32_t refs; // a number of refs to this entry - // cache itself is counted as 1 - - // Include the following flags: - // IN_CACHE: whether this entry is referenced by the hash table. - // IS_HIGH_PRI: whether this entry is high priority entry. - // IN_HIGH_PRI_POOL: whether this entry is in high-pri pool. - // HAS_HIT: whether this entry has had any lookups (hits). + // The hash of key(). Used for fast sharding and comparisons. + uint32_t hash; + // The number of external refs to this entry. The cache itself is not counted. + uint32_t refs; + enum Flags : uint8_t { + // Whether this entry is referenced by the hash table. IN_CACHE = (1 << 0), + // Whether this entry is high priority entry. IS_HIGH_PRI = (1 << 1), + // Whether this entry is in high-pri pool. 
IN_HIGH_PRI_POOL = (1 << 2), + // Wwhether this entry has had any lookups (hits). HAS_HIT = (1 << 3), }; uint8_t flags; - uint32_t hash; // Hash of key(); used for fast sharding and comparisons + // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!) + char key_data[1]; - char key_data[1]; // Beginning of key + Slice key() const { return Slice(key_data, key_length); } - Slice key() const { - // For cheaper lookups, we allow a temporary Handle object - // to store a pointer to a key in "value". - if (next == this) { - return *(reinterpret_cast(value)); - } else { - return Slice(key_data, key_length); - } + // Increase the reference count by 1. + void Ref() { refs++; } + + // Just reduce the reference count by 1. Return true if it was last reference. + bool Unref() { + assert(refs > 0); + refs--; + return refs == 0; } + // Return true if there are external refs, false otherwise. + bool HasRefs() const { return refs > 0; } + bool InCache() const { return flags & IN_CACHE; } bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } @@ -114,12 +123,28 @@ struct LRUHandle { void SetHit() { flags |= HAS_HIT; } void Free() { - assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); + assert(refs == 0); if (deleter) { (*deleter)(key(), value); } delete[] reinterpret_cast(this); } + + // Caclculate the memory usage by metadata + inline size_t CalcTotalCharge( + CacheMetadataChargePolicy metadata_charge_policy) { + assert(key_length); + size_t meta_charge = 0; + if (metadata_charge_policy == kFullChargeCacheMetadata) { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + meta_charge += malloc_usable_size(static_cast(this)); +#else + // This is the size that is used when a new handle is created + meta_charge += sizeof(LRUHandle) - 1 + key_length; +#endif + } + return charge + meta_charge; + } }; // We provide our own simple hash table since it removes a whole bunch @@ -168,8 +193,9 @@ class LRUHandleTable { class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, - double high_pri_pool_ratio, bool use_adaptive_mutex); - virtual ~LRUCacheShard(); + double high_pri_pool_ratio, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy); + virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache // if current usage is more than new capacity, the function will attempt to @@ -225,10 +251,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // high-pri pool is no larger than the size specify by high_pri_pool_pct. void MaintainPoolSize(); - // Just reduce the reference count by 1. 
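The reworked LRUHandle above is the heart of this change: refs now counts only external references, the cache's own interest is carried by the IN_CACHE flag, and the LRU list contains exactly the entries with refs == 0 that are still in the hash table. A minimal stand-in sketch of the three states documented in the header comment (HandleSketch is illustrative, not the real struct):

#include <cassert>
#include <cstdint>

// Stripped-down stand-in for LRUHandle's new accounting.
struct HandleSketch {
  uint32_t refs = 0;    // external references only; the cache is not counted
  bool in_cache = true;

  void Ref() { refs++; }
  bool Unref() {
    assert(refs > 0);
    refs--;
    return refs == 0;
  }
  bool HasRefs() const { return refs > 0; }
  // Eligible for the LRU list (and therefore for eviction) only when nobody
  // outside the cache holds it and the hash table still owns it.
  bool OnLruList() const { return in_cache && !HasRefs(); }
};

int main() {
  HandleSketch h;
  h.Ref();                      // state 1: in hash table, externally referenced
  assert(h.in_cache && h.HasRefs() && !h.OnLruList());

  h.Unref();                    // state 2: in hash table, no external refs
  assert(h.OnLruList());        // -> lives on the LRU list, can be evicted

  h.Ref();                      // Lookup() moves it back to state 1
  h.in_cache = false;           // Erase()/overwrite: state 3, awaits last Unref
  bool last = h.Unref();
  assert(last && !h.in_cache);  // now it would be freed
  return 0;
}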
- // Return true if last reference - bool Unref(LRUHandle* e); - // Free some space following strict LRU policy until enough space // to hold (usage_ + charge) is freed or the lru list is empty // This function is not thread safe - it needs to be executed while @@ -293,7 +315,9 @@ class LRUCache LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex); + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDontChargeCacheMetadata); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } virtual CacheShard* GetShard(int shard) override; diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 9980dd72b7b..f4f4dee69c3 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -8,7 +8,7 @@ #include #include #include "port/port.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { @@ -31,7 +31,8 @@ class LRUCacheTest : public testing::Test { cache_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/, - high_pri_pool_ratio, use_adaptive_mutex); + high_pri_pool_ratio, use_adaptive_mutex, + kDontChargeCacheMetadata); } void Insert(const std::string& key, diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index a48a32185bf..8fc0a7a17a3 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "cache/sharded_cache.h" #include diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 920898b871f..4a396bd47ff 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -40,6 +40,13 @@ class CacheShard { bool thread_safe) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } + void set_metadata_charge_policy( + CacheMetadataChargePolicy metadata_charge_policy) { + metadata_charge_policy_ = metadata_charge_policy; + } + + protected: + CacheMetadataChargePolicy metadata_charge_policy_ = kDontChargeCacheMetadata; }; // Generic cache interface which shards cache by hash of keys. 2^num_shard_bits @@ -54,7 +61,8 @@ class ShardedCache : public Cache { virtual CacheShard* GetShard(int shard) = 0; virtual const CacheShard* GetShard(int shard) const = 0; virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const = 0; + virtual size_t GetCharge(Handle* handle) const override = 0; + virtual uint32_t GetHash(Handle* handle) const = 0; virtual void DisownData() override = 0; diff --git a/cmake/modules/FindJeMalloc.cmake b/cmake/modules/FindJeMalloc.cmake index 7911f77c4c3..f695b3ed1b3 100644 --- a/cmake/modules/FindJeMalloc.cmake +++ b/cmake/modules/FindJeMalloc.cmake @@ -1,21 +1,29 @@ # - Find JeMalloc library # Find the native JeMalloc includes and library # -# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. -# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. -# JEMALLOC_FOUND - True if jemalloc found. +# JeMalloc_INCLUDE_DIRS - where to find jemalloc.h, etc. +# JeMalloc_LIBRARIES - List of libraries when using jemalloc. +# JeMalloc_FOUND - True if jemalloc found. 
-find_path(JEMALLOC_INCLUDE_DIR +find_path(JeMalloc_INCLUDE_DIRS NAMES jemalloc/jemalloc.h HINTS ${JEMALLOC_ROOT_DIR}/include) -find_library(JEMALLOC_LIBRARIES +find_library(JeMalloc_LIBRARIES NAMES jemalloc HINTS ${JEMALLOC_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) +find_package_handle_standard_args(JeMalloc DEFAULT_MSG JeMalloc_LIBRARIES JeMalloc_INCLUDE_DIRS) mark_as_advanced( - JEMALLOC_LIBRARIES - JEMALLOC_INCLUDE_DIR) + JeMalloc_LIBRARIES + JeMalloc_INCLUDE_DIRS) + +if(JeMalloc_FOUND AND NOT (TARGET JeMalloc::JeMalloc)) + add_library (JeMalloc::JeMalloc UNKNOWN IMPORTED) + set_target_properties(JeMalloc::JeMalloc + PROPERTIES + IMPORTED_LOCATION ${JeMalloc_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${JeMalloc_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindNUMA.cmake b/cmake/modules/FindNUMA.cmake index 02760344c68..69b95c9b60b 100644 --- a/cmake/modules/FindNUMA.cmake +++ b/cmake/modules/FindNUMA.cmake @@ -1,11 +1,11 @@ # - Find NUMA # Find the NUMA library and includes # -# NUMA_INCLUDE_DIR - where to find numa.h, etc. +# NUMA_INCLUDE_DIRS - where to find numa.h, etc. # NUMA_LIBRARIES - List of libraries when using NUMA. # NUMA_FOUND - True if NUMA found. -find_path(NUMA_INCLUDE_DIR +find_path(NUMA_INCLUDE_DIRS NAMES numa.h numaif.h HINTS ${NUMA_ROOT_DIR}/include) @@ -14,8 +14,16 @@ find_library(NUMA_LIBRARIES HINTS ${NUMA_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIR) +find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIRS) mark_as_advanced( NUMA_LIBRARIES - NUMA_INCLUDE_DIR) + NUMA_INCLUDE_DIRS) + +if(NUMA_FOUND AND NOT (TARGET NUMA::NUMA)) + add_library (NUMA::NUMA UNKNOWN IMPORTED) + set_target_properties(NUMA::NUMA + PROPERTIES + IMPORTED_LOCATION ${NUMA_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/FindTBB.cmake b/cmake/modules/FindTBB.cmake index 556ce872b17..f6861fa5521 100644 --- a/cmake/modules/FindTBB.cmake +++ b/cmake/modules/FindTBB.cmake @@ -1,7 +1,7 @@ # - Find TBB # Find the Thread Building Blocks library and includes # -# TBB_INCLUDE_DIR - where to find tbb.h, etc. +# TBB_INCLUDE_DIRS - where to find tbb.h, etc. # TBB_LIBRARIES - List of libraries when using TBB. # TBB_FOUND - True if TBB found. 
@@ -9,17 +9,25 @@ if(NOT DEFINED TBB_ROOT_DIR) set(TBB_ROOT_DIR "$ENV{TBBROOT}") endif() -find_path(TBB_INCLUDE_DIR -NAMES tbb/tbb.h -HINTS ${TBB_ROOT_DIR}/include) +find_path(TBB_INCLUDE_DIRS + NAMES tbb/tbb.h + HINTS ${TBB_ROOT_DIR}/include) find_library(TBB_LIBRARIES -NAMES tbb -HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) + NAMES tbb + HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIR) +find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIRS) mark_as_advanced( -TBB_LIBRARIES -TBB_INCLUDE_DIR) + TBB_LIBRARIES + TBB_INCLUDE_DIRS) + +if(TBB_FOUND AND NOT (TARGET TBB::TBB)) + add_library (TBB::TBB UNKNOWN IMPORTED) + set_target_properties(TBB::TBB + PROPERTIES + IMPORTED_LOCATION ${TBB_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${TBB_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findbzip2.cmake b/cmake/modules/Findbzip2.cmake deleted file mode 100644 index 87abbe941e0..00000000000 --- a/cmake/modules/Findbzip2.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# - Find Bzip2 -# Find the bzip2 compression library and includes -# -# BZIP2_INCLUDE_DIR - where to find bzlib.h, etc. -# BZIP2_LIBRARIES - List of libraries when using bzip2. -# BZIP2_FOUND - True if bzip2 found. - -find_path(BZIP2_INCLUDE_DIR - NAMES bzlib.h - HINTS ${BZIP2_ROOT_DIR}/include) - -find_library(BZIP2_LIBRARIES - NAMES bz2 - HINTS ${BZIP2_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(bzip2 DEFAULT_MSG BZIP2_LIBRARIES BZIP2_INCLUDE_DIR) - -mark_as_advanced( - BZIP2_LIBRARIES - BZIP2_INCLUDE_DIR) diff --git a/cmake/modules/Findlz4.cmake b/cmake/modules/Findlz4.cmake index c34acef5e39..7cf7d7f5fe3 100644 --- a/cmake/modules/Findlz4.cmake +++ b/cmake/modules/Findlz4.cmake @@ -1,21 +1,29 @@ # - Find Lz4 # Find the lz4 compression library and includes # -# LZ4_INCLUDE_DIR - where to find lz4.h, etc. -# LZ4_LIBRARIES - List of libraries when using lz4. -# LZ4_FOUND - True if lz4 found. +# lz4_INCLUDE_DIRS - where to find lz4.h, etc. +# lz4_LIBRARIES - List of libraries when using lz4. +# lz4_FOUND - True if lz4 found. -find_path(LZ4_INCLUDE_DIR +find_path(lz4_INCLUDE_DIRS NAMES lz4.h - HINTS ${LZ4_ROOT_DIR}/include) + HINTS ${lz4_ROOT_DIR}/include) -find_library(LZ4_LIBRARIES +find_library(lz4_LIBRARIES NAMES lz4 - HINTS ${LZ4_ROOT_DIR}/lib) + HINTS ${lz4_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(lz4 DEFAULT_MSG LZ4_LIBRARIES LZ4_INCLUDE_DIR) +find_package_handle_standard_args(lz4 DEFAULT_MSG lz4_LIBRARIES lz4_INCLUDE_DIRS) mark_as_advanced( - LZ4_LIBRARIES - LZ4_INCLUDE_DIR) + lz4_LIBRARIES + lz4_INCLUDE_DIRS) + +if(lz4_FOUND AND NOT (TARGET lz4::lz4)) + add_library(lz4::lz4 UNKNOWN IMPORTED) + set_target_properties(lz4::lz4 + PROPERTIES + IMPORTED_LOCATION ${lz4_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${lz4_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findsnappy.cmake b/cmake/modules/Findsnappy.cmake index 6ed5fda3d57..39bba6bd217 100644 --- a/cmake/modules/Findsnappy.cmake +++ b/cmake/modules/Findsnappy.cmake @@ -1,21 +1,29 @@ # - Find Snappy # Find the snappy compression library and includes # -# SNAPPY_INCLUDE_DIR - where to find snappy.h, etc. -# SNAPPY_LIBRARIES - List of libraries when using snappy. -# SNAPPY_FOUND - True if snappy found. +# snappy_INCLUDE_DIRS - where to find snappy.h, etc. +# snappy_LIBRARIES - List of libraries when using snappy. +# snappy_FOUND - True if snappy found. 
-find_path(SNAPPY_INCLUDE_DIR +find_path(snappy_INCLUDE_DIRS NAMES snappy.h - HINTS ${SNAPPY_ROOT_DIR}/include) + HINTS ${snappy_ROOT_DIR}/include) -find_library(SNAPPY_LIBRARIES +find_library(snappy_LIBRARIES NAMES snappy - HINTS ${SNAPPY_ROOT_DIR}/lib) + HINTS ${snappy_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(snappy DEFAULT_MSG SNAPPY_LIBRARIES SNAPPY_INCLUDE_DIR) +find_package_handle_standard_args(snappy DEFAULT_MSG snappy_LIBRARIES snappy_INCLUDE_DIRS) mark_as_advanced( - SNAPPY_LIBRARIES - SNAPPY_INCLUDE_DIR) + snappy_LIBRARIES + snappy_INCLUDE_DIRS) + +if(snappy_FOUND AND NOT (TARGET snappy::snappy)) + add_library (snappy::snappy UNKNOWN IMPORTED) + set_target_properties(snappy::snappy + PROPERTIES + IMPORTED_LOCATION ${snappy_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${snappy_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/Findzstd.cmake b/cmake/modules/Findzstd.cmake index a2964aa9f80..9430821df6e 100644 --- a/cmake/modules/Findzstd.cmake +++ b/cmake/modules/Findzstd.cmake @@ -1,21 +1,29 @@ # - Find zstd # Find the zstd compression library and includes # -# ZSTD_INCLUDE_DIR - where to find zstd.h, etc. -# ZSTD_LIBRARIES - List of libraries when using zstd. -# ZSTD_FOUND - True if zstd found. +# zstd_INCLUDE_DIRS - where to find zstd.h, etc. +# zstd_LIBRARIES - List of libraries when using zstd. +# zstd_FOUND - True if zstd found. -find_path(ZSTD_INCLUDE_DIR +find_path(zstd_INCLUDE_DIRS NAMES zstd.h - HINTS ${ZSTD_ROOT_DIR}/include) + HINTS ${zstd_ROOT_DIR}/include) -find_library(ZSTD_LIBRARIES +find_library(zstd_LIBRARIES NAMES zstd - HINTS ${ZSTD_ROOT_DIR}/lib) + HINTS ${zstd_ROOT_DIR}/lib) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(zstd DEFAULT_MSG ZSTD_LIBRARIES ZSTD_INCLUDE_DIR) +find_package_handle_standard_args(zstd DEFAULT_MSG zstd_LIBRARIES zstd_INCLUDE_DIRS) mark_as_advanced( - ZSTD_LIBRARIES - ZSTD_INCLUDE_DIR) + zstd_LIBRARIES + zstd_INCLUDE_DIRS) + +if(zstd_FOUND AND NOT (TARGET zstd::zstd)) + add_library (zstd::zstd UNKNOWN IMPORTED) + set_target_properties(zstd::zstd + PROPERTIES + IMPORTED_LOCATION ${zstd_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${zstd_INCLUDE_DIRS}) +endif() diff --git a/cmake/modules/ReadVersion.cmake b/cmake/modules/ReadVersion.cmake new file mode 100644 index 00000000000..ebfd7d6f949 --- /dev/null +++ b/cmake/modules/ReadVersion.cmake @@ -0,0 +1,10 @@ +# Read rocksdb version from version.h header file. + +function(get_rocksdb_version version_var) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/include/rocksdb/version.h" version_header_file) + foreach(component MAJOR MINOR PATCH) + string(REGEX MATCH "#define ROCKSDB_${component} ([0-9]+)" _ ${version_header_file}) + set(ROCKSDB_VERSION_${component} ${CMAKE_MATCH_1}) + endforeach() + set(${version_var} "${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}" PARENT_SCOPE) +endfunction() diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py index fbdabd96839..a5e98722202 100644 --- a/coverage/parse_gcov_output.py +++ b/coverage/parse_gcov_output.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python2 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import re import sys diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc new file mode 100644 index 00000000000..840c20e9e4c --- /dev/null +++ b/db/arena_wrapped_db_iter.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/arena_wrapped_db_iter.h" +#include "memory/arena.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "table/internal_iterator.h" +#include "table/iterator_wrapper.h" +#include "util/user_comparator_wrapper.h" + +namespace rocksdb { + +Status ArenaWrappedDBIter::GetProperty(std::string prop_name, + std::string* prop) { + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. + if (!db_iter_->GetProperty(prop_name, prop).ok()) { + *prop = ToString(sv_number_); + } + return Status::OK(); + } + return db_iter_->GetProperty(prop_name, prop); +} + +void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob, + bool allow_refresh) { + auto mem = arena_.AllocateAligned(sizeof(DBIter)); + db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, + cf_options.user_comparator, nullptr, sequence, + true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, allow_blob); + sv_number_ = version_number; + allow_refresh_ = allow_refresh; +} + +Status ArenaWrappedDBIter::Refresh() { + if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { + return Status::NotSupported("Creating renew iterator is not allowed."); + } + assert(db_iter_ != nullptr); + // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the + // correct behavior. Will be corrected automatically when we take a snapshot + // here for the case of WritePreparedTxnDB. 
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); + if (sv_number_ != cur_sv_number) { + Env* env = db_iter_->env(); + db_iter_->~DBIter(); + arena_.~Arena(); + new (&arena_) Arena(); + + SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); + if (read_callback_) { + read_callback_->Refresh(latest_seq); + } + Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, + latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, + allow_refresh_); + + InternalIterator* internal_iter = db_impl_->NewInternalIterator( + read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), + latest_seq); + SetIterUnderDBIter(internal_iter); + } else { + db_iter_->set_sequence(latest_seq); + db_iter_->set_valid(false); + } + return Status::OK(); +} + +ArenaWrappedDBIter* NewArenaWrappedDbIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh) { + ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); + iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, + max_sequential_skip_in_iterations, version_number, read_callback, + db_impl, cfd, allow_blob, allow_refresh); + if (db_impl != nullptr && cfd != nullptr && allow_refresh) { + iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, + allow_blob); + } + + return iter; +} + +} // namespace rocksdb diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h new file mode 100644 index 00000000000..6dbd64521be --- /dev/null +++ b/db/arena_wrapped_db_iter.h @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "db/db_impl/db_impl.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/range_del_aggregator.h" +#include "memory/arena.h" +#include "options/cf_options.h" +#include "rocksdb/db.h" +#include "rocksdb/iterator.h" +#include "util/autovector.h" + +namespace rocksdb { + +class Arena; + +// A wrapper iterator which wraps DB Iterator and the arena, with which the DB +// iterator is supposed to be allocated. This class is used as an entry point of +// a iterator hierarchy whose memory can be allocated inline. In that way, +// accessing the iterator tree can be more cache friendly. It is also faster +// to allocate. +// When using the class's Iterator interface, the behavior is exactly +// the same as the inner DBIter. +class ArenaWrappedDBIter : public Iterator { + public: + virtual ~ArenaWrappedDBIter() { db_iter_->~DBIter(); } + + // Get the arena to be used to allocate memory for DBIter to be wrapped, + // as well as child iterators in it. 
+ virtual Arena* GetArena() { return &arena_; } + virtual ReadRangeDelAggregator* GetRangeDelAggregator() { + return db_iter_->GetRangeDelAggregator(); + } + + // Set the internal iterator wrapped inside the DB Iterator. Usually it is + // a merging iterator. + virtual void SetIterUnderDBIter(InternalIterator* iter) { + static_cast(db_iter_)->SetIter(iter); + } + + virtual bool Valid() const override { return db_iter_->Valid(); } + virtual void SeekToFirst() override { db_iter_->SeekToFirst(); } + virtual void SeekToLast() override { db_iter_->SeekToLast(); } + virtual void Seek(const Slice& target) override { db_iter_->Seek(target); } + virtual void SeekForPrev(const Slice& target) override { + db_iter_->SeekForPrev(target); + } + virtual void Next() override { db_iter_->Next(); } + virtual void Prev() override { db_iter_->Prev(); } + virtual Slice key() const override { return db_iter_->key(); } + virtual Slice value() const override { return db_iter_->value(); } + virtual Status status() const override { return db_iter_->status(); } + bool IsBlob() const { return db_iter_->IsBlob(); } + + virtual Status GetProperty(std::string prop_name, std::string* prop) override; + + virtual Status Refresh() override; + + void Init(Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh); + + // Store some parameters so we can refresh the iterator at a later point + // with these same params + void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, + ColumnFamilyData* cfd, ReadCallback* read_callback, + bool allow_blob) { + read_options_ = read_options; + db_impl_ = db_impl; + cfd_ = cfd; + read_callback_ = read_callback; + allow_blob_ = allow_blob; + } + + private: + DBIter* db_iter_; + Arena arena_; + uint64_t sv_number_; + ColumnFamilyData* cfd_ = nullptr; + DBImpl* db_impl_ = nullptr; + ReadOptions read_options_; + ReadCallback* read_callback_; + bool allow_blob_ = false; + bool allow_refresh_ = true; +}; + +// Generate the arena wrapped iterator class. +// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not +// be supported. +extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl = nullptr, + ColumnFamilyData* cfd = nullptr, bool allow_blob = false, + bool allow_refresh = true); +} // namespace rocksdb diff --git a/utilities/blob_db/blob_index.h b/db/blob_index.h similarity index 92% rename from utilities/blob_db/blob_index.h rename to db/blob_index.h index fd91b547a84..e1d41e27410 100644 --- a/utilities/blob_db/blob_index.h +++ b/db/blob_index.h @@ -5,12 +5,14 @@ #pragma once #ifndef ROCKSDB_LITE +#include +#include + #include "rocksdb/options.h" #include "util/coding.h" #include "util/string_util.h" namespace rocksdb { -namespace blob_db { // BlobIndex is a pointer to the blob and metadata of the blob. The index is // stored in base DB as ValueType::kTypeBlobIndex. 
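ArenaWrappedDBIter above builds its DBIter with placement new into memory handed out by its own Arena, and tears it down by invoking the destructor directly (db_iter_->~DBIter()) so the arena can be reset wholesale on Refresh(). A generic sketch of that allocate-construct-destroy pattern with a toy bump allocator (ToyArena and Widget are illustrative, not RocksDB types):

#include <cstddef>
#include <new>
#include <string>
#include <utility>

// Toy bump allocator standing in for rocksdb::Arena (illustrative only).
class ToyArena {
 public:
  void* AllocateAligned(size_t bytes) {
    // Align to max_align_t; a real arena manages blocks and reuse.
    size_t aligned = (used_ + alignof(std::max_align_t) - 1) &
                     ~(alignof(std::max_align_t) - 1);
    if (aligned + bytes > sizeof(buf_)) return nullptr;
    used_ = aligned + bytes;
    return buf_ + aligned;
  }

 private:
  alignas(std::max_align_t) char buf_[4096];
  size_t used_ = 0;
};

struct Widget {
  explicit Widget(std::string name) : name_(std::move(name)) {}
  std::string name_;
};

int main() {
  ToyArena arena;
  // Construct in arena-owned memory: no separate heap allocation for Widget,
  // so the object sits next to whatever else the arena hands out.
  void* mem = arena.AllocateAligned(sizeof(Widget));
  Widget* w = new (mem) Widget("inline");
  // The arena reclaims its bytes wholesale, so the object must be destroyed
  // manually -- the same reason ArenaWrappedDBIter calls db_iter_->~DBIter().
  w->~Widget();
  return 0;
}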
@@ -109,6 +111,23 @@ class BlobIndex { return Status::OK(); } + std::string DebugString(bool output_hex) { + std::ostringstream oss; + + if (IsInlined()) { + oss << "[inlined blob] value:" << value_.ToString(output_hex); + } else { + oss << "[blob ref] file:" << file_number_ << " offset:" << offset_ + << " size:" << size_; + } + + if (HasTTL()) { + oss << " exp:" << expiration_; + } + + return oss.str(); + } + static void EncodeInlinedTTL(std::string* dst, uint64_t expiration, const Slice& value) { assert(dst != nullptr); @@ -156,6 +175,5 @@ class BlobIndex { CompressionType compression_ = kNoCompression; }; -} // namespace blob_db } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/builder.cc b/db/builder.cc index 7f2fd72a191..e5d08a0725f 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" @@ -21,6 +21,9 @@ #include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" @@ -28,13 +31,11 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/format.h" #include "table/internal_iterator.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" +#include "test_util/sync_point.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -123,7 +124,7 @@ Status BuildTable( if (!s.ok()) { EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, tp, reason, s); + job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s); return s; } file->SetIOPriority(io_priority); @@ -156,8 +157,9 @@ Status BuildTable( for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); + const ParsedInternalKey& ikey = c_iter.ikey(); builder->Add(key, value); - meta->UpdateBoundaries(key, c_iter.ikey().sequence); + meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); // TODO(noetzli): Update stats after flush, too. if (io_priority == Env::IO_HIGH && @@ -221,8 +223,9 @@ Status BuildTable( mutable_cf_options.prefix_extractor.get(), nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), - false /* for_compaction */, nullptr /* arena */, - false /* skip_filter */, level)); + TableReaderCaller::kFlush, /*arena=*/nullptr, + /*skip_filter=*/false, level, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key*/ nullptr)); s = it->status(); if (s.ok() && paranoid_file_checks) { for (it->SeekToFirst(); it->Valid(); it->Next()) { @@ -241,10 +244,13 @@ Status BuildTable( env->DeleteFile(fname); } + if (meta->fd.GetFileSize() == 0) { + fname = "(nil)"; + } // Output to event logger and fire events. 
EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, tp, reason, s); + job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s); return s; } diff --git a/db/builder.h b/db/builder.h index 34a4bff1a25..4fa56f50e34 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include #include "db/range_tombstone_fragmenter.h" #include "db/table_properties_collector.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" @@ -20,7 +21,6 @@ #include "rocksdb/table_properties.h" #include "rocksdb/types.h" #include "table/scoped_arena_iterator.h" -#include "util/event_logger.h" namespace rocksdb { diff --git a/db/c.cc b/db/c.cc index aac1cf4087c..76007e9175a 100644 --- a/db/c.cc +++ b/db/c.cc @@ -517,6 +517,21 @@ rocksdb_t* rocksdb_open_for_read_only( return result; } +rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, + const char* name, + const char* secondary_path, + char** errptr) { + DB* db; + if (SaveError(errptr, + DB::OpenAsSecondary(options->rep, std::string(name), + std::string(secondary_path), &db))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr) { BackupEngine* be; @@ -717,6 +732,37 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( return result; } +rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* db_options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector column_families; + for (int i = 0; i != num_column_families; ++i) { + column_families.emplace_back( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep)); + } + DB* db; + std::vector handles; + if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep), + std::string(name), + std::string(secondary_path), + column_families, &handles, &db))) { + return nullptr; + } + for (size_t i = 0; i != handles.size(); ++i) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, @@ -988,7 +1034,7 @@ void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) { rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) { rocksdb_writebatch_t* result = rocksdb_writebatch_create(); BatchResult wal_batch = iter->rep->GetBatch(); - result->rep = * wal_batch.writeBatchPtr.release(); + result->rep = std::move(*wal_batch.writeBatchPtr); if (seq != nullptr) { *seq = wal_batch.sequence; } @@ -2226,11 +2272,6 @@ void rocksdb_options_set_max_bytes_for_level_base( opt->rep.max_bytes_for_level_base = n; } -void rocksdb_options_set_snap_refresh_nanos(rocksdb_options_t* opt, - uint64_t n) { - opt->rep.snap_refresh_nanos = n; -} - void rocksdb_options_set_level_compaction_dynamic_level_bytes( rocksdb_options_t* opt, unsigned char v) { opt->rep.level_compaction_dynamic_level_bytes = v; @@ -2468,11 +2509,21 @@ 
void rocksdb_options_set_max_write_buffer_number_to_maintain( opt->rep.max_write_buffer_number_to_maintain = n; } +void rocksdb_options_set_max_write_buffer_size_to_maintain( + rocksdb_options_t* opt, int64_t n) { + opt->rep.max_write_buffer_size_to_maintain = n; +} + void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.enable_pipelined_write = v; } +void rocksdb_options_set_unordered_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.unordered_write = v; +} + void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, uint32_t n) { opt->rep.max_subcompactions = n; @@ -3168,6 +3219,11 @@ void rocksdb_writeoptions_set_low_pri( opt->rep.low_pri = v; } +void rocksdb_writeoptions_set_memtable_insert_hint_per_batch( + rocksdb_writeoptions_t* opt, unsigned char v) { + opt->rep.memtable_insert_hint_per_batch = v; +} + rocksdb_compactoptions_t* rocksdb_compactoptions_create() { return new rocksdb_compactoptions_t; } @@ -3268,6 +3324,22 @@ void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } +void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolIOPriority(Env::HIGH); +} + +void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(); +} + +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) { + env->rep->LowerThreadPoolCPUPriority(Env::HIGH); +} + void rocksdb_env_destroy(rocksdb_env_t* env) { if (!env->is_default) delete env->rep; delete env; @@ -3402,6 +3474,10 @@ void rocksdb_ingest_external_file_cf( SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep)); } +void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->TryCatchUpWithPrimary()); +} + rocksdb_slicetransform_t* rocksdb_slicetransform_create( void* state, void (*destructor)(void*), diff --git a/db/c_test.c b/db/c_test.c index 64241df287b..e851aad53f2 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -45,6 +45,7 @@ static char sstfilename[200]; static char dbbackupname[200]; static char dbcheckpointname[200]; static char dbpathname[200]; +static char secondary_path[200]; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -1466,6 +1467,7 @@ int main(int argc, char** argv) { CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); } } @@ -1722,6 +1724,59 @@ int main(int argc, char** argv) { CheckNoError(err); } + // Check that secondary instance works. 
+ StartPhase("open_as_secondary"); + { + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + db = rocksdb_open(db_options, dbname, &err); + CheckNoError(err); + rocksdb_t* db1; + rocksdb_options_t* opts = rocksdb_options_create(); + rocksdb_options_set_max_open_files(opts, -1); + rocksdb_options_set_create_if_missing(opts, 1); + snprintf(secondary_path, sizeof(secondary_path), + "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid())); + db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err); + CheckNoError(err); + + rocksdb_writeoptions_set_sync(woptions, 0); + rocksdb_writeoptions_disable_WAL(woptions, 1); + rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err); + CheckNoError(err); + rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create(); + rocksdb_flushoptions_set_wait(flush_opts, 1); + rocksdb_flush(db, flush_opts, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + rocksdb_readoptions_t* ropts = rocksdb_readoptions_create(); + rocksdb_readoptions_set_verify_checksums(ropts, 1); + rocksdb_readoptions_set_snapshot(ropts, NULL); + CheckGet(db, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key0", "value0"); + + rocksdb_writeoptions_disable_WAL(woptions, 0); + rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err); + CheckNoError(err); + rocksdb_try_catch_up_with_primary(db1, &err); + CheckNoError(err); + CheckGet(db1, ropts, "key0", "value0"); + CheckGet(db1, ropts, "key1", "value1"); + + rocksdb_close(db1); + rocksdb_destroy_db(opts, secondary_path, &err); + CheckNoError(err); + + rocksdb_options_destroy(db_options); + rocksdb_options_destroy(opts); + rocksdb_readoptions_destroy(ropts); + rocksdb_flushoptions_destroy(flush_opts); + } + // Simple sanity check that options setting db_paths work. 
StartPhase("open_db_paths"); { diff --git a/db/column_family.cc b/db/column_family.cc index 4592c945f2e..f66759818e8 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,34 +9,32 @@ #include "db/column_family.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#include -#include #include +#include #include +#include +#include -#include "db/compaction_picker.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_universal.h" -#include "db/db_impl.h" +#include "db/compaction/compaction_picker.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" +#include "db/db_impl/db_impl.h" #include "db/internal_stats.h" #include "db/job_context.h" #include "db/range_del_aggregator.h" #include "db/table_properties_collector.h" #include "db/version_set.h" #include "db/write_controller.h" +#include "file/sst_file_manager_impl.h" #include "memtable/hash_skiplist_rep.h" #include "monitoring/thread_status_util.h" #include "options/options_helper.h" -#include "table/block_based_table_factory.h" +#include "port/port.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "util/autovector.h" #include "util/compression.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { @@ -190,6 +188,11 @@ Status CheckCFPathsSupported(const DBOptions& db_options, return Status::OK(); } +namespace { +const uint64_t kDefaultTtl = 0xfffffffffffffffe; +const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; +}; // namespace + ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; @@ -230,7 +233,14 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, if (result.max_write_buffer_number < 2) { result.max_write_buffer_number = 2; } - if (result.max_write_buffer_number_to_maintain < 0) { + // fall back max_write_buffer_number_to_maintain if + // max_write_buffer_size_to_maintain is not set + if (result.max_write_buffer_size_to_maintain < 0) { + result.max_write_buffer_size_to_maintain = + result.max_write_buffer_number * + static_cast(result.write_buffer_size); + } else if (result.max_write_buffer_size_to_maintain == 0 && + result.max_write_buffer_number_to_maintain < 0) { result.max_write_buffer_number_to_maintain = result.max_write_buffer_number; } // bloom filter size shouldn't exceed 1/4 of memtable size. @@ -338,6 +348,61 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.max_compaction_bytes = result.target_file_size_base * 25; } + bool is_block_based_table = + (result.table_factory->Name() == BlockBasedTableFactory().Name()); + + const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; + if (result.ttl == kDefaultTtl) { + if (is_block_based_table && + result.compaction_style != kCompactionStyleFIFO) { + result.ttl = kAdjustedTtl; + } else { + result.ttl = 0; + } + } + + const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60; + + // Turn on periodic compactions and set them to occur once every 30 days if + // compaction filters are used and periodic_compaction_seconds is set to the + // default value. 
+ if (result.compaction_style != kCompactionStyleFIFO) { + if ((result.compaction_filter != nullptr || + result.compaction_filter_factory != nullptr) && + result.periodic_compaction_seconds == kDefaultPeriodicCompSecs && + is_block_based_table) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + } else { + // result.compaction_style == kCompactionStyleFIFO + if (result.ttl == 0) { + if (is_block_based_table) { + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + result.ttl = result.periodic_compaction_seconds; + } + } else if (result.periodic_compaction_seconds != 0) { + result.ttl = std::min(result.ttl, result.periodic_compaction_seconds); + } + } + + // TTL compactions would work similar to Periodic Compactions in Universal in + // most of the cases. So, if ttl is set, execute the periodic compaction + // codepath. + if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) { + if (result.periodic_compaction_seconds != 0) { + result.periodic_compaction_seconds = + std::min(result.ttl, result.periodic_compaction_seconds); + } else { + result.periodic_compaction_seconds = result.ttl; + } + } + + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { + result.periodic_compaction_seconds = 0; + } + return result; } @@ -408,7 +473,8 @@ ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, WriteBufferManager* write_buffer_manager, const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, - const EnvOptions& env_options, ColumnFamilySet* column_family_set) + const EnvOptions& env_options, ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -425,7 +491,8 @@ ColumnFamilyData::ColumnFamilyData( write_buffer_manager_(write_buffer_manager), mem_(nullptr), imm_(ioptions_.min_write_buffer_number_to_merge, - ioptions_.max_write_buffer_number_to_maintain), + ioptions_.max_write_buffer_number_to_maintain, + ioptions_.max_write_buffer_size_to_maintain), super_version_(nullptr), super_version_number_(0), local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), @@ -448,7 +515,8 @@ ColumnFamilyData::ColumnFamilyData( if (_dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(ioptions_.num_levels, db_options.env, this)); - table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache)); + table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache, + block_cache_tracer)); if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -922,8 +990,12 @@ bool ColumnFamilyData::NeedsCompaction() const { Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { + SequenceNumber earliest_mem_seqno = + std::min(mem_->GetEarliestSequenceNumber(), + imm_.current()->GetEarliestSequenceNumber(false)); auto* result = compaction_picker_->PickCompaction( - GetName(), mutable_options, current_->storage_info(), log_buffer); + GetName(), mutable_options, current_->storage_info(), log_buffer, + earliest_mem_seqno); if (result != nullptr) { result->SetInputVersion(current_); } @@ -1147,13 +1219,51 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +Status ColumnFamilyData::ValidateOptions( + const DBOptions& db_options, 
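The interaction between ttl and periodic_compaction_seconds introduced above reads more easily as a single decision function. The following is a condensed sketch of that logic, not the actual SanitizeOptions code; kDefault mirrors the 0xfffffffffffffffe sentinel used for both options.

#include <algorithm>
#include <cstdint>

void ResolveTtlAndPeriodicSketch(bool fifo, bool universal, bool block_based,
                                 bool has_compaction_filter, uint64_t& ttl,
                                 uint64_t& periodic) {
  const uint64_t kDefault = 0xfffffffffffffffe;
  const uint64_t kThirtyDays = 30 * 24 * 60 * 60;
  if (!fifo) {
    // Compaction filters only run when data is compacted, so force a periodic
    // pass at least every 30 days when a filter is configured.
    if (has_compaction_filter && periodic == kDefault && block_based) {
      periodic = kThirtyDays;
    }
  } else if (ttl == 0) {
    if (block_based) {
      // FIFO without TTL: let the periodic default drive the TTL.
      if (periodic == kDefault) periodic = kThirtyDays;
      ttl = periodic;
    }
  } else if (periodic != 0) {
    // FIFO with both set: the stricter limit wins for TTL.
    ttl = std::min(ttl, periodic);
  }
  if (universal && ttl != 0) {
    // Universal reuses the periodic-compaction code path for TTL.
    periodic = (periodic != 0) ? std::min(ttl, periodic) : ttl;
  }
  if (periodic == kDefault) periodic = 0;  // nothing asked for it
}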
const ColumnFamilyOptions& cf_options) { + Status s; + s = CheckCompressionSupported(cf_options); + if (s.ok() && db_options.allow_concurrent_memtable_write) { + s = CheckConcurrentWritesSupported(cf_options); + } + if (s.ok()) { + s = CheckCFPathsSupported(db_options, cf_options); + } + if (!s.ok()) { + return s; + } + + if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) { + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "TTL is only supported in Block-Based Table format. "); + } + } + + if (cf_options.periodic_compaction_seconds > 0 && + cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) { + if (cf_options.table_factory->Name() != BlockBasedTableFactory().Name()) { + return Status::NotSupported( + "Periodic Compaction is only supported in " + "Block-Based Table format. "); + } + } + return s; +} + #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const std::unordered_map& options_map) { + const DBOptions& db_options, + const std::unordered_map& options_map) { MutableCFOptions new_mutable_cf_options; Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map, ioptions_.info_log, &new_mutable_cf_options); + if (s.ok()) { + ColumnFamilyOptions cf_options = + BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); + s = ValidateOptions(db_options, cf_options); + } if (s.ok()) { mutable_cf_options_ = new_mutable_cf_options; mutable_cf_options_.RefreshDerivedOptions(ioptions_); @@ -1210,18 +1320,20 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) : max_column_family_(0), - dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr, - ColumnFamilyOptions(), *db_options, - env_options, nullptr)), + dummy_cfd_(new ColumnFamilyData( + 0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, + env_options, nullptr, block_cache_tracer)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), env_options_(env_options), table_cache_(table_cache), write_buffer_manager_(write_buffer_manager), - write_controller_(write_controller) { + write_controller_(write_controller), + block_cache_tracer_(block_cache_tracer) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1289,7 +1401,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, env_options_, this); + *db_options_, env_options_, this, block_cache_tracer_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); diff --git a/db/column_family.h b/db/column_family.h index 7a1ae85bfd3..135504ea21b 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -24,6 +24,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" +#include "trace_replay/block_cache_tracer.h" #include "util/thread_local.h" namespace rocksdb { @@ -45,6 +46,112 @@ class InstrumentedMutexLock; struct SuperVersionContext; extern const double kIncSlowdownRatio; +// This file contains a list of data structures for managing column family +// level metadata. 
+// +// The basic relationships among classes declared here are illustrated as +// following: +// +// +----------------------+ +----------------------+ +--------+ +// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl | +// | +----------------------+ | +----------------------+ +----+---+ +// | +--------------------------+ | +// | | +-----------------------------+ +// | | | +// | | +-----------------------------v-------------------------------+ +// | | | | +// | | | ColumnFamilySet | +// | | | | +// | | +-------------+--------------------------+----------------+---+ +// | | | | | +// | +-------------------------------------+ | | +// | | | | v +// | +-------------v-------------+ +-----v----v---------+ +// | | | | | +// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ...... +// | | | | | +// +---> | | | +// | +---------+ | | +// | | MemTable| | | +// | | List | | | +// +--------+---+--+-+----+----+ +--------------------++ +// | | | | +// | | | | +// | | | +-----------------------+ +// | | +-----------+ | +// v +--------+ | | +// +--------+--------+ | | | +// | | | | +----------v----------+ +// +---> |SuperVersion 1.a +-----------------> | +// | +------+ | | MemTableListVersion | +// +---+-------------+ | | | | | +// | | | | +----+------------+---+ +// | current | | | | | +// | +-------------+ | |mem | | +// | | | | | | +// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+ +// | | | | | | | | +// | Version 1.a | | memtable | | memtable | | memtable | +// | | | 1.a | | 1.b | | 1.c | +// +-------------+ | | | | | | +// +----------+ +----------+ +----------+ +// +// DBImpl keeps a ColumnFamilySet, which references to all column families by +// pointing to respective ColumnFamilyData object of each column family. +// This is how DBImpl can list and operate on all the column families. +// ColumnFamilyHandle also points to ColumnFamilyData directly, so that +// when a user executes a query, it can directly find memtables and Version +// as well as SuperVersion to the column family, without going through +// ColumnFamilySet. +// +// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables +// and SST files) indirectly, while ongoing operations may hold references +// to a current or an out-of-date SuperVersion, which in turn points to a +// point-in-time view of the LSM-tree. This guarantees the memtables and SST +// files being operated on will not go away, until the SuperVersion is +// unreferenced to 0 and destoryed. 
+// +// The following graph illustrates a possible referencing relationships: +// +// Column +--------------+ current +-----------+ +// Family +---->+ +------------------->+ | +// Data | SuperVersion +----------+ | Version A | +// | 3 | imm | | | +// Iter2 +----->+ | +-------v------+ +-----------+ +// +-----+--------+ | MemtableList +----------------> Empty +// | | Version r | +-----------+ +// | +--------------+ | | +// +------------------+ current| Version B | +// +--------------+ | +----->+ | +// | | | | +-----+-----+ +// Compaction +>+ SuperVersion +-------------+ ^ +// Job | 2 +------+ | |current +// | +----+ | | mem | +------------+ +// +--------------+ | | +---------------------> | +// | +------------------------> MemTable a | +// | mem | | | +// +--------------+ | | +------------+ +// | +--------------------------+ +// Iter1 +-----> SuperVersion | | +------------+ +// | 1 +------------------------------>+ | +// | +-+ | mem | MemTable b | +// +--------------+ | | | | +// | | +--------------+ +-----^------+ +// | |imm | MemtableList | | +// | +--->+ Version s +------------+ +// | +--------------+ +// | +--------------+ +// | | MemtableList | +// +------>+ Version t +--------> Empty +// imm +--------------+ +// +// In this example, even if the current LSM-tree consists of Version A and +// memtable a, which is also referenced by SuperVersion, two older SuperVersion +// SuperVersion2 and Superversion1 still exist, and are referenced by a +// compaction job and an old iterator Iter1, respectively. SuperVersion2 +// contains Version B, memtable a and memtable b; SuperVersion1 contains +// Version B and memtable b (mutable). As a result, Version B and memtable b +// are prevented from being destroyed or deleted. // ColumnFamilyHandleImpl is the class that clients use to access different // column families. It has non-trivial destructor, which gets called when client @@ -168,7 +275,7 @@ class ColumnFamilyData { // Ref() can only be called from a context where the caller can guarantee // that ColumnFamilyData is alive (while holding a non-zero ref already, // holding a DB mutex, or as the leader in a write batch group). - void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); } + void Ref() { refs_.fetch_add(1); } // Unref decreases the reference count, but does not handle deletion // when the count goes to 0. If this method returns true then the @@ -176,7 +283,7 @@ class ColumnFamilyData { // FreeDeadColumnFamilies(). Unref() can only be called while holding // a DB mutex, or during single-threaded recovery. 
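The two diagrams above boil down to one ownership rule: whoever holds a reference to a SuperVersion keeps its memtables and Version alive, even after newer state has been installed as "current". A self-contained sketch of that rule with stand-in types (not RocksDB code) is below; it also mirrors the plain fetch_add/fetch_sub reference counting used by Ref()/Unref() nearby.

#include <atomic>
#include <memory>
#include <vector>

struct SuperVersionSketch {
  std::vector<std::shared_ptr<int>> memtables;  // stand-ins for MemTables
  std::shared_ptr<int> version;                 // stand-in for a Version
  std::atomic<int> refs{1};

  SuperVersionSketch* Ref() {
    refs.fetch_add(1);
    return this;
  }
  void Unref() {
    // Last holder frees the point-in-time view; until then, iterators and
    // compactions can keep reading the memtables/version it pins.
    if (refs.fetch_sub(1) == 1) delete this;
  }
};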
bool Unref() { - int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed); + int old_refs = refs_.fetch_sub(1); assert(old_refs > 0); return old_refs == 1; } @@ -232,9 +339,13 @@ class ColumnFamilyData { bool is_delete_range_supported() { return is_delete_range_supported_; } + // Validate CF options against DB options + static Status ValidateOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); #ifndef ROCKSDB_LITE // REQUIRES: DB mutex held Status SetOptions( + const DBOptions& db_options, const std::unordered_map& options_map); #endif // ROCKSDB_LITE @@ -394,7 +505,8 @@ class ColumnFamilyData { const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options, const EnvOptions& env_options, - ColumnFamilySet* column_family_set); + ColumnFamilySet* column_family_set, + BlockCacheTracer* const block_cache_tracer); uint32_t id_; const std::string name_; @@ -522,7 +634,8 @@ class ColumnFamilySet { const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -581,6 +694,7 @@ class ColumnFamilySet { Cache* table_cache_; WriteBufferManager* write_buffer_manager_; WriteController* write_controller_; + BlockCacheTracer* const block_cache_tracer_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff --git a/db/column_family_test.cc b/db/column_family_test.cc index bdc832bd235..95c43ac5ae9 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -12,20 +12,22 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" +#include "rocksdb/utilities/object_registry.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/fault_injection_test_env.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -60,8 +62,20 @@ class EnvCounter : public EnvWrapper { class ColumnFamilyTestBase : public testing::Test { public: - ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { - env_ = new EnvCounter(Env::Default()); + explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { + Env* base_env = Env::Default(); +#ifndef ROCKSDB_LITE + const char* test_env_uri = getenv("TEST_ENV_URI"); + if (test_env_uri) { + Status s = ObjectRegistry::NewInstance()->NewSharedObject(test_env_uri, + &env_guard_); + base_env = env_guard_.get(); + EXPECT_OK(s); + EXPECT_NE(Env::Default(), base_env); + } +#endif // !ROCKSDB_LITE + EXPECT_NE(nullptr, base_env); + env_ = new EnvCounter(base_env); dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; @@ -532,6 +546,7 @@ class ColumnFamilyTestBase : public testing::Test { std::string dbname_; DB* db_ = nullptr; EnvCounter* env_; + std::shared_ptr env_guard_; Random rnd_; uint32_t format_; }; @@ -1117,22 +1132,25 @@ 
TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { default_cf.arena_block_size = 4 * 4096; default_cf.max_write_buffer_number = 10; default_cf.min_write_buffer_number_to_merge = 1; - default_cf.max_write_buffer_number_to_maintain = 0; + default_cf.max_write_buffer_size_to_maintain = 0; one.write_buffer_size = 200000; one.arena_block_size = 4 * 4096; one.max_write_buffer_number = 10; one.min_write_buffer_number_to_merge = 2; - one.max_write_buffer_number_to_maintain = 1; + one.max_write_buffer_size_to_maintain = + static_cast(one.write_buffer_size); two.write_buffer_size = 1000000; two.arena_block_size = 4 * 4096; two.max_write_buffer_number = 10; two.min_write_buffer_number_to_merge = 3; - two.max_write_buffer_number_to_maintain = 2; + two.max_write_buffer_size_to_maintain = + static_cast(two.write_buffer_size); three.write_buffer_size = 4096 * 22; three.arena_block_size = 4096; three.max_write_buffer_number = 10; three.min_write_buffer_number_to_merge = 4; - three.max_write_buffer_number_to_maintain = -1; + three.max_write_buffer_size_to_maintain = + static_cast(three.write_buffer_size); Reopen({default_cf, one, two, three}); @@ -3312,7 +3330,17 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index ce80375e0e1..4152cb37939 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -10,13 +10,13 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" namespace rocksdb { @@ -387,7 +387,7 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { auto l0_files_1 = collector->GetFlushedFiles(); CompactionOptions co; co.compression = CompressionType::kLZ4Compression; - CompactionJobInfo compaction_job_info; + CompactionJobInfo compaction_job_info{}; ASSERT_OK( db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info)); ASSERT_EQ(compaction_job_info.base_input_level, 0); diff --git a/db/compacted_db_impl.cc b/db/compacted_db_impl.cc index acdaad4ec29..13cccbd7746 100644 --- a/db/compacted_db_impl.cc +++ b/db/compacted_db_impl.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE #include "db/compacted_db_impl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "table/get_context.h" @@ -37,7 +37,7 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, PinnableSlice* value) { GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, key, value, nullptr, nullptr, - nullptr, nullptr); + true, nullptr, nullptr); LookupKey lkey(key, kMaxSequenceNumber); files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(), &get_context, nullptr); @@ -70,7 +70,7 @@ std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, std::string& value = (*values)[idx]; GetContext 
get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, keys[idx], &pinnable_val, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); r->Get(options, lkey.internal_key(), &get_context, nullptr); value.assign(pinnable_val.data(), pinnable_val.size()); diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index 5c574b4b9a5..8a57c5b77eb 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -5,15 +5,19 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/db_impl.h" -#include #include +#include +#include "db/db_impl/db_impl.h" namespace rocksdb { class CompactedDBImpl : public DBImpl { public: CompactedDBImpl(const DBOptions& options, const std::string& dbname); + // No copying allowed + CompactedDBImpl(const CompactedDBImpl&) = delete; + void operator=(const CompactedDBImpl&) = delete; + virtual ~CompactedDBImpl(); static Status Open(const Options& options, const std::string& dbname, @@ -85,6 +89,15 @@ class CompactedDBImpl : public DBImpl { const IngestExternalFileOptions& /*ingestion_options*/) override { return Status::NotSupported("Not supported in compacted db mode."); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } private: friend class DB; @@ -95,10 +108,6 @@ class CompactedDBImpl : public DBImpl { Version* version_; const Comparator* user_comparator_; LevelFilesBrief files_; - - // No copying allowed - CompactedDBImpl(const CompactedDBImpl&); - void operator=(const CompactedDBImpl&); }; } #endif // ROCKSDB_LITE diff --git a/db/compaction.cc b/db/compaction/compaction.cc similarity index 97% rename from db/compaction.cc rename to db/compaction/compaction.cc index f8805376f1d..d83bb719704 100644 --- a/db/compaction.cc +++ b/db/compaction/compaction.cc @@ -7,19 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/compaction.h" - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" +#include "db/compaction/compaction.h" #include "rocksdb/compaction_filter.h" +#include "test_util/sync_point.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -550,17 +545,16 @@ bool Compaction::ShouldFormSubcompactions() const { } } -uint64_t Compaction::MaxInputFileCreationTime() const { - uint64_t max_creation_time = 0; +uint64_t Compaction::MinInputFileOldestAncesterTime() const { + uint64_t min_oldest_ancester_time = port::kMaxUint64; for (const auto& file : inputs_[0].files) { - if (file->fd.table_reader != nullptr && - file->fd.table_reader->GetTableProperties() != nullptr) { - uint64_t creation_time = - file->fd.table_reader->GetTableProperties()->creation_time; - max_creation_time = std::max(max_creation_time, creation_time); + uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0) { + min_oldest_ancester_time = + std::min(min_oldest_ancester_time, oldest_ancester_time); } } - return max_creation_time; + return min_oldest_ancester_time; } int Compaction::GetInputBaseLevel() const { diff --git a/db/compaction.h b/db/compaction/compaction.h similarity index 98% rename from db/compaction.h rename to db/compaction/compaction.h index 2cf737b676a..dec5e607e1a 100644 --- a/db/compaction.h +++ b/db/compaction/compaction.h @@ -9,11 +9,13 @@ #pragma once #include "db/version_set.h" +#include "memory/arena.h" #include "options/cf_options.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { +// The file contains class Compaction, as well as some helper functions +// and data structures used by the class. // Utility for comparing sstable boundary keys. Returns -1 if either a or b is // null which provides the property that a==null indicates a key that is less @@ -63,7 +65,7 @@ class ColumnFamilyData; class VersionStorageInfo; class CompactionFilter; -// A Compaction encapsulates information about a compaction. +// A Compaction encapsulates metadata about a compaction. class Compaction { public: Compaction(VersionStorageInfo* input_version, @@ -289,7 +291,7 @@ class Compaction { uint32_t max_subcompactions() const { return max_subcompactions_; } - uint64_t MaxInputFileCreationTime() const; + uint64_t MinInputFileOldestAncesterTime() const; private: // mark (or clear) all files that are being compacted @@ -376,7 +378,7 @@ class Compaction { CompactionReason compaction_reason_; }; -// Utility function +// Return sum of sizes of all files in `files`. extern uint64_t TotalFileSize(const std::vector& files); } // namespace rocksdb diff --git a/db/compaction_iteration_stats.h b/db/compaction/compaction_iteration_stats.h similarity index 100% rename from db/compaction_iteration_stats.h rename to db/compaction/compaction_iteration_stats.h diff --git a/db/compaction_iterator.cc b/db/compaction/compaction_iterator.cc similarity index 91% rename from db/compaction_iterator.cc rename to db/compaction/compaction_iterator.cc index bce0b82dbc7..59097eec0cf 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -3,13 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/compaction_iterator.h" +#include +#include "db/compaction/compaction_iterator.h" #include "db/snapshot_checker.h" #include "port/likely.h" #include "rocksdb/listener.h" #include "table/internal_iterator.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" #define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ ((seq) <= (snapshot) && \ @@ -39,7 +40,8 @@ CompactionIterator::CompactionIterator( const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - SnapshotListFetchCallback* snap_list_callback) + const std::atomic* manual_compaction_paused, + const std::shared_ptr info_log) : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, @@ -47,7 +49,7 @@ CompactionIterator::CompactionIterator( std::unique_ptr( compaction ? new CompactionProxy(compaction) : nullptr), compaction_filter, shutting_down, preserve_deletes_seqnum, - snap_list_callback) {} + manual_compaction_paused, info_log) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -60,7 +62,8 @@ CompactionIterator::CompactionIterator( const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, - SnapshotListFetchCallback* snap_list_callback) + const std::atomic* manual_compaction_paused, + const std::shared_ptr info_log) : input_(input), cmp_(cmp), merge_helper_(merge_helper), @@ -74,12 +77,13 @@ CompactionIterator::CompactionIterator( compaction_(std::move(compaction)), compaction_filter_(compaction_filter), shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), preserve_deletes_seqnum_(preserve_deletes_seqnum), current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), current_key_committed_(false), - snap_list_callback_(snap_list_callback) { + info_log_(info_log) { assert(compaction_filter_ == nullptr || compaction_ != nullptr); assert(snapshots_ != nullptr); bottommost_level_ = @@ -87,7 +91,24 @@ CompactionIterator::CompactionIterator( if (compaction_ != nullptr) { level_ptrs_ = std::vector(compaction_->number_levels(), 0); } - ProcessSnapshotList(); + if (snapshots_->size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); + earliest_snapshot_ = kMaxSequenceNumber; + latest_snapshot_ = 0; + } else { + visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); + earliest_snapshot_ = snapshots_->at(0); + latest_snapshot_ = snapshots_->back(); + } +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); + } +#endif input_->SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } @@ -126,6 +147,11 @@ void CompactionIterator::Next() { // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } + // Keep current_key_ in sync. 
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); key_ = current_key_.GetInternalKey(); @@ -209,33 +235,12 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } } -void CompactionIterator::ProcessSnapshotList() { -#ifndef NDEBUG - // findEarliestVisibleSnapshot assumes this ordering. - for (size_t i = 1; i < snapshots_->size(); ++i) { - assert(snapshots_->at(i - 1) < snapshots_->at(i)); - } -#endif - if (snapshots_->size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip_ = true; - earliest_snapshot_iter_ = snapshots_->end(); - earliest_snapshot_ = kMaxSequenceNumber; - latest_snapshot_ = 0; - } else { - visible_at_tip_ = false; - earliest_snapshot_iter_ = snapshots_->begin(); - earliest_snapshot_ = snapshots_->at(0); - latest_snapshot_ = snapshots_->back(); - } - released_snapshots_.clear(); -} - void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; - while (!valid_ && input_->Valid() && !IsShuttingDown()) { + while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + !IsShuttingDown()) { key_ = input_->key(); value_ = input_->value(); iter_stats_.num_input_records++; @@ -278,13 +283,6 @@ void CompactionIterator::NextFromInput() { // compaction filter). ikey_.user_key is pointing to the copy. if (!has_current_user_key_ || !cmp_->Equal(ikey_.user_key, current_user_key_)) { - num_keys_++; - // Use num_keys_ to reduce the overhead of reading current time - if (snap_list_callback_ && snapshots_->size() && - snap_list_callback_->TimeToRefresh(num_keys_)) { - snap_list_callback_->Refresh(snapshots_, latest_snapshot_); - ProcessSnapshotList(); - } // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); @@ -350,7 +348,18 @@ void CompactionIterator::NextFromInput() { // not compact out. We will keep this Put, but can drop it's data. // (See Optimization 3, below.) assert(ikey_.type == kTypeValue); + if (ikey_.type != kTypeValue) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected key type %d for compaction output", + ikey_.type); + } assert(current_user_key_snapshot_ == last_snapshot); + if (current_user_key_snapshot_ != last_snapshot) { + ROCKS_LOG_FATAL(info_log_, + "current_user_key_snapshot_ (%" PRIu64 + ") != last_snapshot (%" PRIu64 ")", + current_user_key_snapshot_, last_snapshot); + } value_.clear(); valid_ = true; @@ -492,20 +501,12 @@ void CompactionIterator::NextFromInput() { // checking since there has already been a record returned for this key // in this snapshot. assert(last_sequence >= current_user_key_sequence_); - - // Note2: if last_snapshot < current_user_key_snapshot, it can only - // mean last_snapshot is released between we process last value and - // this value, and findEarliestVisibleSnapshot returns the next snapshot - // as current_user_key_snapshot. In this case last value and current - // value are both in current_user_key_snapshot currently. - // Although last_snapshot is released we might still get a definitive - // response when key sequence number changes, e.g., when seq is determined - // too old and visible in all snapshots. 
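The assert plus ROCKS_LOG_FATAL pairs added throughout this file follow one pattern: the assert documents the invariant for debug builds, while the explicit check keeps release builds, where NDEBUG compiles the assert away, from silently running past a violated invariant. A hypothetical macro capturing the shape of the pattern (this change writes the two statements out by hand instead):

#include <cassert>
#include "logging/logging.h"  // ROCKS_LOG_FATAL

// Hypothetical helper, shown only to name the pattern.
#define CHECK_INVARIANT_SKETCH(cond, logger, msg) \
  do {                                            \
    assert(cond);                                 \
    if (!(cond)) {                                \
      ROCKS_LOG_FATAL((logger), "%s", (msg));     \
    }                                             \
  } while (0)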
- assert(last_snapshot == current_user_key_snapshot_ || - (snapshot_checker_ != nullptr && - snapshot_checker_->CheckInSnapshot(current_user_key_sequence_, - last_snapshot) != - SnapshotCheckerResult::kNotInSnapshot)); + if (last_sequence < current_user_key_sequence_) { + ROCKS_LOG_FATAL(info_log_, + "last_sequence (%" PRIu64 + ") < current_user_key_sequence_ (%" PRIu64 ")", + last_sequence, current_user_key_sequence_); + } ++iter_stats_.num_record_drop_hidden; // (A) input_->Next(); @@ -589,6 +590,10 @@ void CompactionIterator::NextFromInput() { // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. assert(valid_key); + if (!valid_key) { + ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", + key_.ToString(true).c_str()); + } // Keep current_key_ in sync. current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); key_ = current_key_.GetInternalKey(); @@ -627,6 +632,10 @@ void CompactionIterator::NextFromInput() { if (!valid_ && IsShuttingDown()) { status_ = Status::ShutdownInProgress(); } + + if (IsPausingManualCompaction()) { + status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } } void CompactionIterator::PrepareOutput() { @@ -645,6 +654,11 @@ void CompactionIterator::PrepareOutput() { ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ && valid_ && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) { assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected key type %d for seq-zero optimization", + ikey_.type); + } ikey_.sequence = 0; current_key_.UpdateInternalKey(0, ikey_.type); } @@ -653,6 +667,10 @@ void CompactionIterator::PrepareOutput() { inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot) { assert(snapshots_->size()); + if (snapshots_->size() == 0) { + ROCKS_LOG_FATAL(info_log_, + "No snapshot left in findEarliestVisibleSnapshot"); + } auto snapshots_iter = std::lower_bound( snapshots_->begin(), snapshots_->end(), in); if (snapshots_iter == snapshots_->begin()) { @@ -660,6 +678,10 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( } else { *prev_snapshot = *std::prev(snapshots_iter); assert(*prev_snapshot < in); + if (*prev_snapshot >= in) { + ROCKS_LOG_FATAL(info_log_, + "*prev_snapshot >= in in findEarliestVisibleSnapshot"); + } } if (snapshot_checker_ == nullptr) { return snapshots_iter != snapshots_->end() @@ -669,6 +691,9 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { auto cur = *snapshots_iter; assert(in <= cur); + if (in > cur) { + ROCKS_LOG_FATAL(info_log_, "in > cur in findEarliestVisibleSnapshot"); + } // Skip if cur is in released_snapshots. 
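The new manual_compaction_paused_ pointer gives manual compactions a cooperative cancellation point: a controller thread flips an atomic, the iterator polls it with relaxed ordering in its input loop (see IsPausingManualCompaction() further below), and winds down with Status::Incomplete(kManualCompactionPaused). A minimal sketch of that handshake, with stand-alone names that are illustrative rather than the actual DBImpl plumbing:

#include <atomic>

std::atomic<bool> manual_compaction_paused{false};

// Controller side: request that in-flight manual compactions wind down.
void RequestManualCompactionPauseSketch() {
  manual_compaction_paused.store(true, std::memory_order_relaxed);
}

// Worker side, mirroring IsPausingManualCompaction() in this file:
// best-effort polling, so relaxed ordering is sufficient.
bool IsPausingManualCompactionSketch() {
  return manual_compaction_paused.load(std::memory_order_relaxed);
}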
if (has_released_snapshot && released_snapshots_.count(cur) > 0) { continue; @@ -693,9 +718,14 @@ inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() { bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { assert(snapshot_checker_ != nullptr); - assert(earliest_snapshot_ == kMaxSequenceNumber || - (earliest_snapshot_iter_ != snapshots_->end() && - *earliest_snapshot_iter_ == earliest_snapshot_)); + bool pre_condition = (earliest_snapshot_ == kMaxSequenceNumber || + (earliest_snapshot_iter_ != snapshots_->end() && + *earliest_snapshot_iter_ == earliest_snapshot_)); + assert(pre_condition); + if (!pre_condition) { + ROCKS_LOG_FATAL(info_log_, + "Pre-Condition is not hold in IsInEarliestSnapshot"); + } auto in_snapshot = snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) { @@ -714,6 +744,10 @@ bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); } assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased); + if (in_snapshot == SnapshotCheckerResult::kSnapshotReleased) { + ROCKS_LOG_FATAL(info_log_, + "Unexpected released snapshot in IsInEarliestSnapshot"); + } return in_snapshot == SnapshotCheckerResult::kInSnapshot; } diff --git a/db/compaction_iterator.h b/db/compaction/compaction_iterator.h similarity index 66% rename from db/compaction_iterator.h rename to db/compaction/compaction_iterator.h index 6ab43b1becf..1e08b407d28 100644 --- a/db/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -10,8 +10,8 @@ #include #include -#include "db/compaction.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" @@ -21,53 +21,6 @@ namespace rocksdb { -// This callback can be used to refresh the snapshot list from the db. It -// includes logics to exponentially decrease the refresh rate to limit the -// overhead of refresh. -class SnapshotListFetchCallback { - public: - SnapshotListFetchCallback(Env* env, uint64_t snap_refresh_nanos, - size_t every_nth_key = 1024) - : timer_(env, /*auto restart*/ true), - snap_refresh_nanos_(snap_refresh_nanos), - every_nth_key_minus_one_(every_nth_key - 1) { - assert(every_nth_key > 0); - assert((ceil(log2(every_nth_key)) == floor(log2(every_nth_key)))); - } - // Refresh the snapshot list. snapshots will bre replacted with the new list. - // max is the upper bound. Note: this function will acquire the db_mutex_. - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) = 0; - inline bool TimeToRefresh(const size_t key_index) { - // skip the key if key_index % every_nth_key (which is of power 2) is not 0. 
- if ((key_index & every_nth_key_minus_one_) != 0) { - return false; - } - const uint64_t elapsed = timer_.ElapsedNanos(); - auto ret = elapsed > snap_refresh_nanos_; - // pre-compute the next time threshold - if (ret) { - // inc next refresh period exponentially (by x4) - auto next_refresh_threshold = snap_refresh_nanos_ << 2; - // make sure the shift has not overflown the highest 1 bit - snap_refresh_nanos_ = - std::max(snap_refresh_nanos_, next_refresh_threshold); - } - return ret; - } - static constexpr SnapshotListFetchCallback* kDisabled = nullptr; - - virtual ~SnapshotListFetchCallback() {} - - private: - // Time since the callback was created - StopWatchNano timer_; - // The delay before calling ::Refresh. To be increased exponentially. - uint64_t snap_refresh_nanos_; - // Skip evey nth key. Number n if of power 2. The math will require n-1. - const uint64_t every_nth_key_minus_one_; -}; - class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what @@ -106,32 +59,34 @@ class CompactionIterator { const Compaction* compaction_; }; - CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - const Compaction* compaction = nullptr, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + const Compaction* compaction = nullptr, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::shared_ptr info_log = nullptr); // Constructor with custom CompactionProxy, used for tests. 
- CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - std::unique_ptr compaction, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - SnapshotListFetchCallback* snap_list_callback = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + std::unique_ptr compaction, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::shared_ptr info_log = nullptr); ~CompactionIterator(); @@ -159,8 +114,6 @@ class CompactionIterator { private: // Processes the input stream to find the next output void NextFromInput(); - // Process snapshots_ and assign related variables - void ProcessSnapshotList(); // Do last preparations before presenting the output to the callee. At this // point this only zeroes out the sequence number if possible for better @@ -195,7 +148,7 @@ class CompactionIterator { InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; - std::vector* snapshots_; + const std::vector* snapshots_; // List of snapshots released during compaction. // findEarliestVisibleSnapshot() find them out from return of // snapshot_checker, and make sure they will not be returned as @@ -212,6 +165,7 @@ class CompactionIterator { std::unique_ptr compaction_; const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; + const std::atomic* manual_compaction_paused_; const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; bool valid_ = false; @@ -270,13 +224,17 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - SnapshotListFetchCallback* snap_list_callback_; - // number of distinct keys processed - size_t num_keys_ = 0; + std::shared_ptr info_log_; bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); } + + bool IsPausingManualCompaction() { + // This is a best-effort facility, so memory_order_relaxed is sufficient. + return manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed); + } }; } // namespace rocksdb diff --git a/db/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc similarity index 99% rename from db/compaction_iterator_test.cc rename to db/compaction/compaction_iterator_test.cc index c466f6c9122..94f297961e2 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -3,15 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/compaction_iterator.h" #include #include +#include "db/compaction/compaction_iterator.h" #include "port/port.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -184,7 +184,7 @@ class TestSnapshotChecker : public SnapshotChecker { public: explicit TestSnapshotChecker( SequenceNumber last_committed_sequence, - const std::unordered_map& snapshots = {}) + const std::unordered_map& snapshots = {{}}) : last_committed_sequence_(last_committed_sequence), snapshots_(snapshots) {} diff --git a/db/compaction_job.cc b/db/compaction/compaction_job.cc similarity index 94% rename from db/compaction_job.cc rename to db/compaction/compaction_job.cc index bc127a4c45c..22c504fde26 100644 --- a/db/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,14 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_job.h" - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include @@ -25,7 +19,8 @@ #include #include "db/builder.h" -#include "db/db_impl.h" +#include "db/compaction/compaction_job.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/error_handler.h" @@ -38,6 +33,12 @@ #include "db/merge_helper.h" #include "db/range_del_aggregator.h" #include "db/version_set.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -47,21 +48,16 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -315,7 +311,7 @@ CompactionJob::CompactionJob( const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback) + Env::Priority thread_pri, const std::atomic* manual_compaction_paused) : job_id_(job_id), compact_(new CompactionState(compaction)), compaction_job_stats_(compaction_job_stats), @@ -324,10 +320,11 @@ CompactionJob::CompactionJob( db_options_(db_options), env_options_(env_options), env_(db_options.env), - env_optiosn_for_read_( + env_options_for_read_( env_->OptimizeForCompactionTableRead(env_options, db_options_)), versions_(versions), shutting_down_(shutting_down), + manual_compaction_paused_(manual_compaction_paused), 
preserve_deletes_seqnum_(preserve_deletes_seqnum), log_buffer_(log_buffer), db_directory_(db_directory), @@ -336,7 +333,6 @@ CompactionJob::CompactionJob( db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), - snap_list_callback_(snap_list_callback), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), @@ -415,7 +411,6 @@ void CompactionJob::Prepare() { write_hint_ = c->column_family_data()->CalculateSSTWriteHint(c->output_level()); - // Is this compaction producing files at the bottommost level? bottommost_level_ = c->bottommost_level(); if (c->ShouldFormSubcompactions()) { @@ -445,11 +440,6 @@ struct RangeWithSize { : range(a, b), size(s) {} }; -// Generates a histogram representing potential divisions of key ranges from -// the input. It adds the starting and/or ending keys of certain input files -// to the working set and then finds the approximate size of data in between -// each consecutive pair of slices. Then it divides these ranges into -// consecutive groups such that each group has a similar size. void CompactionJob::GenSubcompactionBoundaries() { auto* c = compact_->compaction; auto* cfd = c->column_family_data(); @@ -519,7 +509,7 @@ void CompactionJob::GenSubcompactionBoundaries() { auto* v = compact_->compaction->input_version(); for (auto it = bounds.begin();;) { const Slice a = *it; - it++; + ++it; if (it == bounds.end()) { break; @@ -531,7 +521,9 @@ void CompactionJob::GenSubcompactionBoundaries() { // to the index block and may incur I/O cost in the process. Unlock db // mutex to reduce contention db_mutex_->Unlock(); - uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1); + uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a, + b, start_lvl, out_lvl + 1, + TableReaderCaller::kCompaction); db_mutex_->Lock(); ranges.emplace_back(a, b, size); sum += size; @@ -656,12 +648,14 @@ Status CompactionJob::Run() { // to cache it here for further user reads InternalIterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), env_options_, cfd->internal_comparator(), - *files_meta[file_idx], nullptr /* range_del_agg */, - prefix_extractor, nullptr, + *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), - false, nullptr /* arena */, false /* skip_filters */, - compact_->compaction->output_level()); + TableReaderCaller::kCompactionRefill, /*arena=*/nullptr, + /*skip_filters=*/false, compact_->compaction->output_level(), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { @@ -842,7 +836,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, &range_del_agg, env_optiosn_for_read_)); + sub_compact->compaction, &range_del_agg, env_options_for_read_)); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -873,9 +867,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { false /* internal key corruption is expected */, existing_snapshots_.empty() ? 
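GenSubcompactionBoundaries above still follows the approach described by the comment deleted from this file: estimate the data size between candidate boundary keys, now via the extended ApproximateSize overload, and then cut the sorted ranges into groups of similar total size. A simplified sketch of the grouping step only (illustration, not the actual code):

#include <cstdint>
#include <vector>

std::vector<size_t> GroupBoundariesSketch(const std::vector<uint64_t>& sizes,
                                          uint64_t total,
                                          unsigned subcompactions) {
  std::vector<size_t> cut_points;  // indices where a new group starts
  const uint64_t target = total / subcompactions + 1;
  uint64_t acc = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    acc += sizes[i];
    if (acc >= target && cut_points.size() + 1 < subcompactions) {
      cut_points.push_back(i + 1);  // next range begins a new subcompaction
      acc = 0;
    }
  }
  return cut_points;
}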
0 : existing_snapshots_.back(), snapshot_checker_, compact_->compaction->level(), - db_options_.statistics.get(), shutting_down_); + db_options_.statistics.get()); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:1", + reinterpret_cast( + const_cast*>(manual_compaction_paused_))); Slice* start = sub_compact->start; Slice* end = sub_compact->end; @@ -893,7 +891,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, &range_del_agg, sub_compact->compaction, compaction_filter, - shutting_down_, preserve_deletes_seqnum_, snap_list_callback_)); + shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, + db_options_.info_log)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { @@ -935,8 +934,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact->current_output() != nullptr); sub_compact->builder->Add(key, value); sub_compact->current_output_file_size = sub_compact->builder->FileSize(); + const ParsedInternalKey& ikey = c_iter->ikey(); sub_compact->current_output()->meta.UpdateBoundaries( - key, c_iter->ikey().sequence); + key, value, ikey.sequence, ikey.type); sub_compact->num_output_records++; // Close output file if it is big enough. Two possibilities determine it's @@ -957,7 +957,14 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { input_status = input->status(); output_file_ended = true; } + TEST_SYNC_POINT_CALLBACK( + "CompactionJob::Run():PausingManualCompaction:2", + reinterpret_cast( + const_cast*>(manual_compaction_paused_))); c_iter->Next(); + if (c_iter->status().IsManualCompactionPaused()) { + break; + } if (!output_file_ended && c_iter->Valid() && sub_compact->compaction->output_level() != 0 && sub_compact->ShouldStopBefore(c_iter->key(), @@ -1002,10 +1009,18 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); RecordCompactionIOStats(); - if (status.ok() && - (shutting_down_->load(std::memory_order_relaxed) || cfd->IsDropped())) { - status = Status::ShutdownInProgress( - "Database shutdown or Column family drop during compaction"); + if (status.ok() && cfd->IsDropped()) { + status = + Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_relaxed)) { + status = Status::ShutdownInProgress("Database shutdown"); + } + if ((status.ok() || status.IsColumnFamilyDropped()) && + (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed))) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } if (status.ok()) { status = input->status(); @@ -1336,17 +1351,20 @@ Status CompactionJob::FinishCompactionOutputFile( } std::string fname; FileDescriptor output_fd; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; if (meta != nullptr) { fname = TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); output_fd = meta->fd; + oldest_blob_file_number = meta->oldest_blob_file_number; } else { fname = "(nil)"; } 
EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, - job_id_, output_fd, tp, TableFileCreationReason::kCompaction, s); + job_id_, output_fd, oldest_blob_file_number, tp, + TableFileCreationReason::kCompaction, s); #ifndef ROCKSDB_LITE // Report new file to SstFileManagerImpl @@ -1456,17 +1474,38 @@ Status CompactionJob::OpenCompactionOutputFile( LogFlush(db_options_.info_log); EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), - fname, job_id_, FileDescriptor(), TableProperties(), - TableFileCreationReason::kCompaction, s); + fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber, + TableProperties(), TableFileCreationReason::kCompaction, s); return s; } - SubcompactionState::Output out; - out.meta.fd = - FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0); - out.finished = false; + // Try to figure out the output file's oldest ancester time. + int64_t temp_current_time = 0; + auto get_time_status = env_->GetCurrentTime(&temp_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!get_time_status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time. Status: %s", + get_time_status.ToString().c_str()); + } + uint64_t current_time = static_cast(temp_current_time); + uint64_t oldest_ancester_time = + sub_compact->compaction->MinInputFileOldestAncesterTime(); + if (oldest_ancester_time == port::kMaxUint64) { + oldest_ancester_time = current_time; + } + + // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + { + SubcompactionState::Output out; + out.meta.fd = FileDescriptor(file_number, + sub_compact->compaction->output_path_id(), 0); + out.meta.oldest_ancester_time = oldest_ancester_time; + out.meta.file_creation_time = current_time; + out.finished = false; + sub_compact->outputs.push_back(out); + } - sub_compact->outputs.push_back(out); writable_file->SetIOPriority(Env::IO_LOW); writable_file->SetWriteLifeTimeHint(write_hint_); writable_file->SetPreallocationBlockSize(static_cast( @@ -1483,22 +1522,6 @@ Status CompactionJob::OpenCompactionOutputFile( bool skip_filters = cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; - int64_t temp_current_time = 0; - auto get_time_status = env_->GetCurrentTime(&temp_current_time); - // Safe to proceed even if GetCurrentTime fails. So, log and proceed. - if (!get_time_status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to get current time. 
Status: %s", - get_time_status.ToString().c_str()); - } - uint64_t current_time = static_cast(temp_current_time); - - uint64_t latest_key_time = - sub_compact->compaction->MaxInputFileCreationTime(); - if (latest_key_time == 0) { - latest_key_time = current_time; - } - sub_compact->builder.reset(NewTableBuilder( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), @@ -1506,9 +1529,9 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->output_compression(), 0 /*sample_for_compression */, sub_compact->compaction->output_compression_opts(), - sub_compact->compaction->output_level(), skip_filters, latest_key_time, - 0 /* oldest_key_time */, sub_compact->compaction->max_output_file_size(), - current_time)); + sub_compact->compaction->output_level(), skip_filters, + oldest_ancester_time, 0 /* oldest_key_time */, + sub_compact->compaction->max_output_file_size(), current_time)); LogFlush(db_options_.info_log); return s; } diff --git a/db/compaction_job.h b/db/compaction/compaction_job.h similarity index 68% rename from db/compaction_job.h rename to db/compaction/compaction_job.h index b3a0f2eb4b5..bca4c994265 100644 --- a/db/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -18,7 +18,7 @@ #include #include "db/column_family.h" -#include "db/compaction_iterator.h" +#include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" @@ -29,6 +29,7 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "options/cf_options.h" #include "options/db_options.h" #include "port/port.h" @@ -40,7 +41,6 @@ #include "rocksdb/transaction_log.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" @@ -55,22 +55,30 @@ class Version; class VersionEdit; class VersionSet; +// CompactionJob is responsible for executing the compaction. Each (manual or +// automated) compaction corresponds to a CompactionJob object, and usually +// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob +// will divide the compaction into subcompactions and execute them in parallel +// if needed. 
class CompactionJob { public: - CompactionJob( - int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const EnvOptions env_options, VersionSet* versions, - const std::atomic* shutting_down, - const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - Directory* db_directory, Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, - std::vector existing_snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, - const std::string& dbname, CompactionJobStats* compaction_job_stats, - Env::Priority thread_pri, SnapshotListFetchCallback* snap_list_callback); + CompactionJob(int job_id, Compaction* compaction, + const ImmutableDBOptions& db_options, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, + std::shared_ptr table_cache, EventLogger* event_logger, + bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri, + const std::atomic* manual_compaction_paused = nullptr); ~CompactionJob(); @@ -80,17 +88,28 @@ class CompactionJob { CompactionJob& operator=(const CompactionJob& job) = delete; // REQUIRED: mutex held + // Prepare for the compaction by setting up boundaries for each subcompaction void Prepare(); // REQUIRED mutex not held + // Launch threads for each subcompaction and wait for them to finish. After + // that, verify table is usable and finally do bookkeeping to unify + // subcompaction results Status Run(); // REQUIRED: mutex held + // Add compaction input/output to the current version Status Install(const MutableCFOptions& mutable_cf_options); private: struct SubcompactionState; void AggregateStatistics(); + + // Generates a histogram representing potential divisions of key ranges from + // the input. It adds the starting and/or ending keys of certain input files + // to the working set and then finds the approximate size of data in between + // each consecutive pair of slices. Then it divides these ranges into + // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); // update the thread status for starting a compaction. @@ -135,9 +154,10 @@ class CompactionJob { Env* env_; // env_option optimized for compaction table reads - EnvOptions env_optiosn_for_read_; + EnvOptions env_options_for_read_; VersionSet* versions_; const std::atomic* shutting_down_; + const std::atomic* manual_compaction_paused_; const SequenceNumber preserve_deletes_seqnum_; LogBuffer* log_buffer_; Directory* db_directory_; @@ -150,7 +170,6 @@ class CompactionJob { // entirely within s1 and s2, then the earlier version of k1 can be safely // deleted because that version is not visible in any snapshot. std::vector existing_snapshots_; - SnapshotListFetchCallback* snap_list_callback_; // This is the earliest snapshot that could be used for write-conflict // checking by a transaction. 
For any user-key newer than this snapshot, we @@ -163,6 +182,7 @@ class CompactionJob { EventLogger* event_logger_; + // Is this compaction creating a file in the bottom most level? bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; diff --git a/db/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc similarity index 99% rename from db/compaction_job_stats_test.cc rename to db/compaction/compaction_job_stats_test.cc index 48e883bc6cc..f25a38ec65e 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -7,12 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include @@ -21,12 +17,14 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -47,20 +45,18 @@ #include "rocksdb/thread_status.h" #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" -#include "util/filename.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" #if !defined(IOS_CROSS_COMPILE) diff --git a/db/compaction_job_test.cc b/db/compaction/compaction_job_test.cc similarity index 86% rename from db/compaction_job_test.cc rename to db/compaction/compaction_job_test.cc index 60394cc9735..9fb3f0df5dc 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -5,30 +5,28 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include +#include #include #include #include +#include "db/blob_index.h" #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/version_set.h" +#include "file/writable_file_writer.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -81,7 +79,8 @@ class CompactionJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - 
&write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), @@ -99,11 +98,34 @@ class CompactionJobTest : public testing::Test { return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); } - std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num, - const ValueType t) { + static std::string KeyStr(const std::string& user_key, + const SequenceNumber seq_num, const ValueType t) { return InternalKey(user_key, seq_num, t).Encode().ToString(); } + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + + static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset, + uint64_t size, uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset, + size, kNoCompression); + return blob_index; + } + + static std::string BlobStrInlinedTTL(const Slice& value, + uint64_t expiration) { + std::string blob_index; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value); + return blob_index; + } + void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) { assert(contents.size() > 0); @@ -112,12 +134,13 @@ class CompactionJobTest : public testing::Test { InternalKey smallest_key, largest_key; SequenceNumber smallest_seqno = kMaxSequenceNumber; SequenceNumber largest_seqno = 0; + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; for (auto kv : contents) { ParsedInternalKey key; std::string skey; std::string value; std::tie(skey, value) = kv; - ParseInternalKey(skey, &key); + bool parsed = ParseInternalKey(skey, &key); smallest_seqno = std::min(smallest_seqno, key.sequence); largest_seqno = std::max(largest_seqno, key.sequence); @@ -134,6 +157,24 @@ class CompactionJobTest : public testing::Test { } first_key = false; + + if (parsed && key.type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + continue; + } + + if (blob_index.IsInlined() || blob_index.HasTTL() || + blob_index.file_number() == kInvalidBlobFileNumber) { + continue; + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } } uint64_t file_number = versions_->NewFileNumber(); @@ -142,7 +183,8 @@ class CompactionJobTest : public testing::Test { VersionEdit edit; edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key, - smallest_seqno, largest_seqno, false); + smallest_seqno, largest_seqno, false, oldest_blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); mutex_.Lock(); versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), @@ -204,10 +246,18 @@ class CompactionJobTest : public testing::Test { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); compaction_job_stats_.Reset(); + SetIdentityFile(env_, dbname_); VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + 
new_db.SetDBId(db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -245,8 +295,7 @@ class CompactionJobTest : public testing::Test { const std::vector& snapshots = {}, SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, int output_level = 1, bool verify = true, - SnapshotListFetchCallback* snapshot_fetcher = - SnapshotListFetchCallback::kDisabled) { + uint64_t expected_oldest_blob_file_number = kInvalidBlobFileNumber) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); size_t num_input_files = 0; @@ -279,7 +328,7 @@ class CompactionJobTest : public testing::Test { nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER, snapshot_fetcher); + Env::Priority::USER); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -292,15 +341,20 @@ class CompactionJobTest : public testing::Test { mutex_.Unlock(); if (verify) { - if (expected_results.size() == 0) { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); + ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); + + if (expected_results.empty()) { ASSERT_EQ(compaction_job_stats_.num_output_files, 0U); } else { - ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U); - ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files); ASSERT_EQ(compaction_job_stats_.num_output_files, 1U); mock_table_factory_->AssertLatestFile(expected_results); + + auto output_files = + cfd->current()->storage_info()->LevelFiles(output_level); + ASSERT_EQ(output_files.size(), 1); + ASSERT_EQ(output_files[0]->oldest_blob_file_number, + expected_oldest_blob_file_number); } } } @@ -956,103 +1010,52 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { RunCompaction({files}, expected_results); } -// Test the snapshot fetcher in compaction -TEST_F(CompactionJobTest, SnapshotRefresh) { - uint64_t time_seed = env_->NowMicros(); - printf("time_seed is %" PRIu64 "\n", time_seed); // would help to reproduce - Random64 rand(time_seed); - std::vector db_snapshots; - class SnapshotListFetchCallbackTest : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackTest(Env* env, Random64& rand, - std::vector* snapshots) - : SnapshotListFetchCallback(env, 0 /*no time delay*/, - 1 /*fetch after each key*/), - rand_(rand), - snapshots_(snapshots) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber) override { - assert(snapshots->size()); - assert(snapshots_->size()); - assert(snapshots_->size() == snapshots->size()); - if (rand_.OneIn(2)) { - uint64_t release_index = rand_.Uniform(snapshots_->size()); - snapshots_->erase(snapshots_->begin() + release_index); - *snapshots = *snapshots_; - } - } - - private: - Random64 rand_; - std::vector* snapshots_; - } snapshot_fetcher(env_, rand, &db_snapshots); - - std::vector> file1_kvs, file2_kvs; - std::array types = {kTypeValue, kTypeDeletion, - kTypeSingleDeletion}; - SequenceNumber last_seq = 0; - for (int i = 1; i < 100; i++) { - SequenceNumber seq = last_seq + 1; - last_seq = seq; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file1_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file1 = mock::MakeMockFile(file1_kvs); - for (int i = 1; i < 100; i++) 
{ - SequenceNumber seq = last_seq + 1; - last_seq++; - if (rand.OneIn(2)) { - auto type = types[rand.Uniform(types.size())]; - file2_kvs.push_back( - {test::KeyStr("k" + ToString(i), seq, type), "v" + ToString(i)}); - } - } - auto file2 = mock::MakeMockFile(file2_kvs); - for (SequenceNumber i = 1; i < last_seq + 1; i++) { - if (rand.OneIn(5)) { - db_snapshots.push_back(i); - } - } - - const bool kVerify = true; - const int output_level_0 = 0; +TEST_F(CompactionJobTest, OldestBlobFileNumber) { NewDB(); - AddMockFile(file1); - AddMockFile(file2); - SetLastSequence(last_seq); - auto files = cfd_->current()->storage_info()->LevelFiles(0); - // put the output on L0 since it is easier to feed them again to the 2nd - // compaction - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify, &snapshot_fetcher); - - // Now db_snapshots are changed. Run the compaction again without snapshot - // fetcher but with the updated snapshot list. - compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, file1, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); - // The result should be what we get if we run compaction without snapshot - // fetcher on the updated list of snapshots - auto expected = mock_table_factory_->output(); - NewDB(); + // Note: blob1 is inlined TTL, so it will not be considered for the purposes + // of identifying the oldest referenced blob file. Similarly, blob6 will be + // ignored because it has TTL and hence refers to a TTL blob file. + const stl_wrappers::KVMap::value_type blob1( + KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL)); + const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex), + BlobStr(59, 123456, 999)); + const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex), + BlobStr(138, 1000, 1 << 8)); + auto file1 = mock::MakeMockFile({blob1, blob2, blob3}); AddMockFile(file1); + + const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex), + BlobStr(199, 3 << 10, 1 << 20)); + const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex), + BlobStr(19, 6789, 333)); + const stl_wrappers::KVMap::value_type blob6( + KeyStr("f", 6U, kTypeBlobIndex), + BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL)); + auto file2 = mock::MakeMockFile({blob4, blob5, blob6}); AddMockFile(file2); - SetLastSequence(last_seq); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0, !kVerify); - // The 2nd compaction above would get rid of useless delete markers. To get - // the output here exactly as what we got above after two compactions, we also - // run the compaction for 2nd time. 
- compaction_job_stats_.Reset(); - files = cfd_->current()->storage_info()->LevelFiles(0); - RunCompaction({files}, expected, db_snapshots, kMaxSequenceNumber, - output_level_0 + 1, !kVerify); + + const stl_wrappers::KVMap::value_type expected_blob1( + KeyStr("a", 0U, kTypeBlobIndex), blob1.second); + const stl_wrappers::KVMap::value_type expected_blob2( + KeyStr("b", 0U, kTypeBlobIndex), blob2.second); + const stl_wrappers::KVMap::value_type expected_blob3( + KeyStr("c", 0U, kTypeBlobIndex), blob3.second); + const stl_wrappers::KVMap::value_type expected_blob4( + KeyStr("d", 0U, kTypeBlobIndex), blob4.second); + const stl_wrappers::KVMap::value_type expected_blob5( + KeyStr("e", 0U, kTypeBlobIndex), blob5.second); + const stl_wrappers::KVMap::value_type expected_blob6( + KeyStr("f", 0U, kTypeBlobIndex), blob6.second); + auto expected_results = + mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3, + expected_blob4, expected_blob5, expected_blob6}); + + SetLastSequence(6U); + auto files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results, std::vector(), + kMaxSequenceNumber, /* output_level */ 1, /* verify */ true, + /* expected_oldest_blob_file_number */ 19); } } // namespace rocksdb diff --git a/db/compaction_picker.cc b/db/compaction/compaction_picker.cc similarity index 67% rename from db/compaction_picker.cc rename to db/compaction/compaction_picker.cc index d6d7b69876e..0461ff32d1c 100644 --- a/db/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -7,25 +7,21 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include #include #include #include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" -#include "util/filename.h" -#include "util/log_buffer.h" +#include "test_util/sync_point.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -42,29 +38,54 @@ uint64_t TotalCompensatedFileSize(const std::vector& files) { bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, - CompactionInputFiles* comp_inputs) { - size_t compact_bytes = static_cast(level_files[0]->fd.file_size); + uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno) { + // Do not pick ingested file when there is at least one memtable not flushed + // which of seqno is overlap with the sst. + TEST_SYNC_POINT("FindIntraL0Compaction"); + size_t start = 0; + for (; start < level_files.size(); start++) { + if (level_files[start]->being_compacted) { + return false; + } + // If there is no data in memtable, the earliest sequence number would the + // largest sequence number in last memtable. + // Because all files are sorted in descending order by largest_seqno, so we + // only need to check the first one. 
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) { + break; + } + } + if (start >= level_files.size()) { + return false; + } + size_t compact_bytes = static_cast(level_files[start]->fd.file_size); + uint64_t compensated_compact_bytes = + level_files[start]->compensated_file_size; size_t compact_bytes_per_del_file = port::kMaxSizet; - // compaction range will be [0, span_len). - size_t span_len; - // pull in files until the amount of compaction work per deleted file begins - // increasing. + // Compaction range will be [start, limit). + size_t limit; + // Pull in files until the amount of compaction work per deleted file begins + // increasing or maximum total compaction size is reached. size_t new_compact_bytes_per_del_file = 0; - for (span_len = 1; span_len < level_files.size(); ++span_len) { - compact_bytes += static_cast(level_files[span_len]->fd.file_size); - new_compact_bytes_per_del_file = compact_bytes / span_len; - if (level_files[span_len]->being_compacted || - new_compact_bytes_per_del_file > compact_bytes_per_del_file) { + for (limit = start + 1; limit < level_files.size(); ++limit) { + compact_bytes += static_cast(level_files[limit]->fd.file_size); + compensated_compact_bytes += level_files[limit]->compensated_file_size; + new_compact_bytes_per_del_file = compact_bytes / (limit - start); + if (level_files[limit]->being_compacted || + new_compact_bytes_per_del_file > compact_bytes_per_del_file || + compensated_compact_bytes > max_compaction_bytes) { break; } compact_bytes_per_del_file = new_compact_bytes_per_del_file; } - if (span_len >= min_files_to_compact && + if ((limit - start) >= min_files_to_compact && compact_bytes_per_del_file < max_compact_bytes_per_del_file) { assert(comp_inputs != nullptr); comp_inputs->level = 0; - for (size_t i = 0; i < span_len; ++i) { + for (size_t i = start; i < limit; ++i) { comp_inputs->files.push_back(level_files[i]); } return true; @@ -1107,536 +1128,4 @@ bool CompactionPicker::GetOverlappingL0Files( return true; } -bool LevelCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage) const { - if (!vstorage->ExpiredTtlFiles().empty()) { - return true; - } - if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { - return true; - } - if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { - return true; - } - if (!vstorage->FilesMarkedForCompaction().empty()) { - return true; - } - for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { - if (vstorage->CompactionScore(i) >= 1) { - return true; - } - } - return false; -} - -namespace { -// A class to build a leveled compaction step-by-step. -class LevelCompactionBuilder { - public: - LevelCompactionBuilder(const std::string& cf_name, - VersionStorageInfo* vstorage, - CompactionPicker* compaction_picker, - LogBuffer* log_buffer, - const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions) - : cf_name_(cf_name), - vstorage_(vstorage), - compaction_picker_(compaction_picker), - log_buffer_(log_buffer), - mutable_cf_options_(mutable_cf_options), - ioptions_(ioptions) {} - - // Pick and return a compaction. - Compaction* PickCompaction(); - - // Pick the initial files to compact to the next level. (or together - // in Intra-L0 compactions) - void SetupInitialFiles(); - - // If the initial files are from L0 level, pick other L0 - // files if needed. - bool SetupOtherL0FilesIfNeeded(); - - // Based on initial files, setup other files need to be compacted - // in this compaction, accordingly. 
- bool SetupOtherInputsIfNeeded(); - - Compaction* GetCompaction(); - - // For the specfied level, pick a file that we want to compact. - // Returns false if there is no file to compact. - // If it returns true, inputs->files.size() will be exactly one. - // If level is 0 and there is already a compaction on that level, this - // function will return false. - bool PickFileToCompact(); - - // For L0->L0, picks the longest span of files that aren't currently - // undergoing compaction for which work-per-deleted-file decreases. The span - // always starts from the newest L0 file. - // - // Intra-L0 compaction is independent of all other files, so it can be - // performed even when L0->base_level compactions are blocked. - // - // Returns true if `inputs` is populated with a span of files to be compacted; - // otherwise, returns false. - bool PickIntraL0Compaction(); - - void PickExpiredTtlFiles(); - - void PickFilesMarkedForPeriodicCompaction(); - - const std::string& cf_name_; - VersionStorageInfo* vstorage_; - CompactionPicker* compaction_picker_; - LogBuffer* log_buffer_; - int start_level_ = -1; - int output_level_ = -1; - int parent_index_ = -1; - int base_index_ = -1; - double start_level_score_ = 0; - bool is_manual_ = false; - CompactionInputFiles start_level_inputs_; - std::vector compaction_inputs_; - CompactionInputFiles output_level_inputs_; - std::vector grandparents_; - CompactionReason compaction_reason_ = CompactionReason::kUnknown; - - const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; - // Pick a path ID to place a newly generated file, with its level - static uint32_t GetPathId(const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - int level); - - static const int kMinFilesForIntraL0Compaction = 4; -}; - -void LevelCompactionBuilder::PickExpiredTtlFiles() { - if (vstorage_->ExpiredTtlFiles().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. - // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - start_level_ = level_file.first; - output_level_ = - (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - - if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || - (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty())) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->ExpiredTtlFiles()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { - if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { - return; - } - - auto continuation = [&](std::pair level_file) { - // If it's being compacted it has nothing to do here. 
- // If this assert() fails that means that some function marked some - // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - output_level_ = start_level_ = level_file.first; - - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - return false; - } - - start_level_inputs_.files = {level_file.second}; - start_level_inputs_.level = start_level_; - return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_); - }; - - for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { - if (continuation(level_file)) { - // found the compaction! - return; - } - } - - start_level_inputs_.files.clear(); -} - -void LevelCompactionBuilder::SetupInitialFiles() { - // Find the compactions by size on all levels. - bool skipped_l0_to_base = false; - for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { - start_level_score_ = vstorage_->CompactionScore(i); - start_level_ = vstorage_->CompactionScoreLevel(i); - assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); - if (start_level_score_ >= 1) { - if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { - // If L0->base_level compaction is pending, don't schedule further - // compaction from base level. Otherwise L0->base_level compaction - // may starve. - continue; - } - output_level_ = - (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (PickFileToCompact()) { - // found the compaction! - if (start_level_ == 0) { - // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - } else { - // L1+ score = `Level files size` / `MaxBytesForLevel` - compaction_reason_ = CompactionReason::kLevelMaxLevelSize; - } - break; - } else { - // didn't find the compaction, clear the inputs - start_level_inputs_.clear(); - if (start_level_ == 0) { - skipped_l0_to_base = true; - // L0->base_level may be blocked due to ongoing L0->base_level - // compactions. It may also be blocked by an ongoing compaction from - // base_level downwards. - // - // In these cases, to reduce L0 file count and thus reduce likelihood - // of write stalls, we can attempt compacting a span of files within - // L0. 
- if (PickIntraL0Compaction()) { - output_level_ = 0; - compaction_reason_ = CompactionReason::kLevelL0FilesNum; - break; - } - } - } - } - } - - // if we didn't find a compaction, check if there are any files marked for - // compaction - if (start_level_inputs_.empty()) { - parent_index_ = base_index_ = -1; - - compaction_picker_->PickFilesMarkedForCompaction( - cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); - if (!start_level_inputs_.empty()) { - is_manual_ = true; - compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; - return; - } - } - - // Bottommost Files Compaction on deleting tombstones - if (start_level_inputs_.empty()) { - size_t i; - for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); - ++i) { - auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; - assert(!level_and_file.second->being_compacted); - start_level_inputs_.level = output_level_ = start_level_ = - level_and_file.first; - start_level_inputs_.files = {level_and_file.second}; - if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_)) { - break; - } - } - if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { - start_level_inputs_.clear(); - } else { - assert(!start_level_inputs_.empty()); - compaction_reason_ = CompactionReason::kBottommostFiles; - return; - } - } - - // TTL Compaction - if (start_level_inputs_.empty()) { - PickExpiredTtlFiles(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kTtl; - return; - } - } - - // Periodic Compaction - if (start_level_inputs_.empty()) { - PickFilesMarkedForPeriodicCompaction(); - if (!start_level_inputs_.empty()) { - compaction_reason_ = CompactionReason::kPeriodicCompaction; - return; - } - } -} - -bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { - if (start_level_ == 0 && output_level_ != 0) { - return compaction_picker_->GetOverlappingL0Files( - vstorage_, &start_level_inputs_, output_level_, &parent_index_); - } - return true; -} - -bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { - // Setup input files from output level. For output to L0, we only compact - // spans of files that do not interact with any pending compactions, so don't - // need to consider other levels. - if (output_level_ != 0) { - output_level_inputs_.level = output_level_; - if (!compaction_picker_->SetupOtherInputs( - cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, - &output_level_inputs_, &parent_index_, base_index_)) { - return false; - } - - compaction_inputs_.push_back(start_level_inputs_); - if (!output_level_inputs_.empty()) { - compaction_inputs_.push_back(output_level_inputs_); - } - - // In some edge cases we could pick a compaction that will be compacting - // a key range that overlap with another running compaction, and both - // of them have the same output level. This could happen if - // (1) we are running a non-exclusive manual compaction - // (2) AddFile ingest a new file into the LSM tree - // We need to disallow this from happening. - if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, - output_level_)) { - // This compaction output could potentially conflict with the output - // of a currently running compaction, we cannot run it. 
- return false; - } - compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, - output_level_inputs_, &grandparents_); - } else { - compaction_inputs_.push_back(start_level_inputs_); - } - return true; -} - -Compaction* LevelCompactionBuilder::PickCompaction() { - // Pick up the first file to start compaction. It may have been extended - // to a clean cut. - SetupInitialFiles(); - if (start_level_inputs_.empty()) { - return nullptr; - } - assert(start_level_ >= 0 && output_level_ >= 0); - - // If it is a L0 -> base level compaction, we need to set up other L0 - // files if needed. - if (!SetupOtherL0FilesIfNeeded()) { - return nullptr; - } - - // Pick files in the output level and expand more files in the start level - // if needed. - if (!SetupOtherInputsIfNeeded()) { - return nullptr; - } - - // Form a compaction object containing the files we picked. - Compaction* c = GetCompaction(); - - TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); - - return c; -} - -Compaction* LevelCompactionBuilder::GetCompaction() { - auto c = new Compaction( - vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), - output_level_, - MaxFileSizeForLevel(mutable_cf_options_, output_level_, - ioptions_.compaction_style, vstorage_->base_level(), - ioptions_.level_compaction_dynamic_level_bytes), - mutable_cf_options_.max_compaction_bytes, - GetPathId(ioptions_, mutable_cf_options_, output_level_), - GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, - output_level_, vstorage_->base_level()), - GetCompressionOptions(ioptions_, vstorage_, output_level_), - /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, - start_level_score_, false /* deletion_compaction */, compaction_reason_); - - // If it's level 0 compaction, make sure we don't execute any other level 0 - // compactions in parallel - compaction_picker_->RegisterCompaction(c); - - // Creating a compaction influences the compaction score because the score - // takes running compactions into account (by skipping files that are already - // being compacted). Since we just changed compaction score, we recalculate it - // here - vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); - return c; -} - -/* - * Find the optimal path to place a file - * Given a level, finds the path where levels up to it will fit in levels - * up to and including this path - */ -uint32_t LevelCompactionBuilder::GetPathId( - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, int level) { - uint32_t p = 0; - assert(!ioptions.cf_paths.empty()); - - // size remaining in the most recent path - uint64_t current_path_size = ioptions.cf_paths[0].target_size; - - uint64_t level_size; - int cur_level = 0; - - // max_bytes_for_level_base denotes L1 size. - // We estimate L0 size to be the same as L1. - level_size = mutable_cf_options.max_bytes_for_level_base; - - // Last path is the fallback - while (p < ioptions.cf_paths.size() - 1) { - if (level_size <= current_path_size) { - if (cur_level == level) { - // Does desired level fit in this path? - return p; - } else { - current_path_size -= level_size; - if (cur_level > 0) { - if (ioptions.level_compaction_dynamic_level_bytes) { - // Currently, level_compaction_dynamic_level_bytes is ignored when - // multiple db paths are specified. https://github.com/facebook/ - // rocksdb/blob/master/db/column_family.cc. 
- // Still, adding this check to avoid accidentally using - // max_bytes_for_level_multiplier_additional - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier); - } else { - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier * - mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); - } - } - cur_level++; - continue; - } - } - p++; - current_path_size = ioptions.cf_paths[p].target_size; - } - return p; -} - -bool LevelCompactionBuilder::PickFileToCompact() { - // level 0 files are overlapping. So we cannot pick more - // than one concurrent compactions at this level. This - // could be made better by looking at key-ranges that are - // being compacted at level 0. - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { - TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); - return false; - } - - start_level_inputs_.clear(); - - assert(start_level_ >= 0); - - // Pick the largest file in this level that is not already - // being compacted - const std::vector& file_size = - vstorage_->FilesByCompactionPri(start_level_); - const std::vector& level_files = - vstorage_->LevelFiles(start_level_); - - unsigned int cmp_idx; - for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); - cmp_idx < file_size.size(); cmp_idx++) { - int index = file_size[cmp_idx]; - auto* f = level_files[index]; - - // do not pick a file to compact if it is being compacted - // from n-1 level. - if (f->being_compacted) { - continue; - } - - start_level_inputs_.files.push_back(f); - start_level_inputs_.level = start_level_; - if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &start_level_inputs_) || - compaction_picker_->FilesRangeOverlapWithCompaction( - {start_level_inputs_}, output_level_)) { - // A locked (pending compaction) input-level file was pulled in due to - // user-key overlap. - start_level_inputs_.clear(); - continue; - } - - // Now that input level is fully expanded, we check whether any output files - // are locked due to pending compaction. - // - // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- - // level files are locked, not just the extra ones pulled in for user-key - // overlap. - InternalKey smallest, largest; - compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); - CompactionInputFiles output_level_inputs; - output_level_inputs.level = output_level_; - vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, - &output_level_inputs.files); - if (!output_level_inputs.empty() && - !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, - &output_level_inputs)) { - start_level_inputs_.clear(); - continue; - } - base_index_ = index; - break; - } - - // store where to start the iteration in the next call to PickCompaction - vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); - - return start_level_inputs_.size() > 0; -} - -bool LevelCompactionBuilder::PickIntraL0Compaction() { - start_level_inputs_.clear(); - const std::vector& level_files = - vstorage_->LevelFiles(0 /* level */); - if (level_files.size() < - static_cast( - mutable_cf_options_.level0_file_num_compaction_trigger + 2) || - level_files[0]->being_compacted) { - // If L0 isn't accumulating much files beyond the regular trigger, don't - // resort to L0->L0 compaction yet. 
- return false; - } - return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, - port::kMaxUint64, &start_level_inputs_); -} -} // namespace - -Compaction* LevelCompactionPicker::PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, - mutable_cf_options, ioptions_); - return builder.PickCompaction(); -} - } // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction/compaction_picker.h similarity index 80% rename from db/compaction_picker.h rename to db/compaction/compaction_picker.h index 01f5495e67b..ae158059a1b 100644 --- a/db/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -15,7 +15,7 @@ #include #include -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/version_set.h" #include "options/cf_options.h" #include "rocksdb/env.h" @@ -24,11 +24,26 @@ namespace rocksdb { +// The file contains an abstract class CompactionPicker, and its two +// sub-classes LevelCompactionPicker and NullCompactionPicker, as +// well as some helper functions used by them. + class LogBuffer; class Compaction; class VersionStorageInfo; struct CompactionInputFiles; +// An abstract class to pick compactions from an existing LSM-tree. +// +// Each compaction style inherits the class and implement the +// interface to form automatic compactions. If NeedCompaction() is true, +// then call PickCompaction() to find what files need to be compacted +// and where to put the output files. +// +// Non-virtual functions CompactRange() and CompactFiles() are used to +// pick files to compact based on users' DB::CompactRange() and +// DB::CompactFiles() requests, respectively. There is little +// compaction style specific logic for them. class CompactionPicker { public: CompactionPicker(const ImmutableCFOptions& ioptions, @@ -39,10 +54,10 @@ class CompactionPicker { // Returns nullptr if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) = 0; + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns nullptr if there is nothing in that @@ -221,21 +236,9 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; -class LevelCompactionPicker : public CompactionPicker { - public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; -}; - #ifndef ROCKSDB_LITE +// A dummy compaction that never triggers any automatic +// compaction. 
class NullCompactionPicker : public CompactionPicker { public: NullCompactionPicker(const ImmutableCFOptions& ioptions, @@ -244,10 +247,11 @@ class NullCompactionPicker : public CompactionPicker { virtual ~NullCompactionPicker() {} // Always return "nullptr" - Compaction* PickCompaction(const std::string& /*cf_name*/, - const MutableCFOptions& /*mutable_cf_options*/, - VersionStorageInfo* /*vstorage*/, - LogBuffer* /*log_buffer*/) override { + Compaction* PickCompaction( + const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, + SequenceNumber /* earliest_memtable_seqno */) override { return nullptr; } @@ -273,10 +277,27 @@ class NullCompactionPicker : public CompactionPicker { }; #endif // !ROCKSDB_LITE -bool FindIntraL0Compaction(const std::vector& level_files, - size_t min_files_to_compact, - uint64_t max_compact_bytes_per_del_file, - CompactionInputFiles* comp_inputs); +// Attempts to find an intra L0 compaction conforming to the given parameters. +// +// @param level_files Metadata for L0 files. +// @param min_files_to_compact Minimum number of files required to +// do the compaction. +// @param max_compact_bytes_per_del_file Maximum average size in bytes per +// file that is going to get deleted by +// the compaction. +// @param max_compaction_bytes Maximum total size in bytes (in terms +// of compensated file size) for files +// to be compacted. +// @param [out] comp_inputs If a compaction was found, will be +// initialized with corresponding input +// files. Cannot be nullptr. +// +// @return true iff compaction was found. +bool FindIntraL0Compaction( + const std::vector& level_files, size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs, + SequenceNumber earliest_mem_seqno = kMaxSequenceNumber); CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, const VersionStorageInfo* vstorage, diff --git a/db/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc similarity index 93% rename from db/compaction_picker_fifo.cc rename to db/compaction/compaction_picker_fifo.cc index 1322989e568..cdf5e46dab6 100644 --- a/db/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -7,18 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_fifo.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include "db/column_family.h" -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "util/string_util.h" namespace rocksdb { @@ -58,6 +54,15 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( } const uint64_t current_time = static_cast(_current_time); + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. 
No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + std::vector inputs; inputs.emplace_back(); inputs[0].level = 0; @@ -134,7 +139,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( mutable_cf_options .level0_file_num_compaction_trigger /* min_files_to_compact */ , - max_compact_bytes_per_del_file, &comp_inputs)) { + max_compact_bytes_per_del_file, + mutable_cf_options.max_compaction_bytes, &comp_inputs)) { Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */, @@ -196,7 +202,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber /*earliest_memtable_seqno*/) { assert(vstorage->num_levels() == 1); Compaction* c = nullptr; diff --git a/db/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h similarity index 86% rename from db/compaction_picker_fifo.h rename to db/compaction/compaction_picker_fifo.h index 9da107c5d4a..065faef1398 100644 --- a/db/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -10,7 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#include "db/compaction_picker.h" +#include "db/compaction/compaction_picker.h" namespace rocksdb { class FIFOCompactionPicker : public CompactionPicker { @@ -19,10 +19,10 @@ class FIFOCompactionPicker : public CompactionPicker { const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* version, - LogBuffer* log_buffer) override; + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc new file mode 100644 index 00000000000..4c2afa66709 --- /dev/null +++ b/db/compaction/compaction_picker_level.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include +#include + +#include "db/compaction/compaction_picker_level.h" +#include "logging/log_buffer.h" +#include "test_util/sync_point.h" + +namespace rocksdb { + +bool LevelCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } + if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + +namespace { +// A class to build a leveled compaction step-by-step. +class LevelCompactionBuilder { + public: + LevelCompactionBuilder(const std::string& cf_name, + VersionStorageInfo* vstorage, + SequenceNumber earliest_mem_seqno, + CompactionPicker* compaction_picker, + LogBuffer* log_buffer, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& ioptions) + : cf_name_(cf_name), + vstorage_(vstorage), + earliest_mem_seqno_(earliest_mem_seqno), + compaction_picker_(compaction_picker), + log_buffer_(log_buffer), + mutable_cf_options_(mutable_cf_options), + ioptions_(ioptions) {} + + // Pick and return a compaction. + Compaction* PickCompaction(); + + // Pick the initial files to compact to the next level. (or together + // in Intra-L0 compactions) + void SetupInitialFiles(); + + // If the initial files are from L0 level, pick other L0 + // files if needed. + bool SetupOtherL0FilesIfNeeded(); + + // Based on initial files, setup other files need to be compacted + // in this compaction, accordingly. + bool SetupOtherInputsIfNeeded(); + + Compaction* GetCompaction(); + + // For the specfied level, pick a file that we want to compact. + // Returns false if there is no file to compact. + // If it returns true, inputs->files.size() will be exactly one. + // If level is 0 and there is already a compaction on that level, this + // function will return false. + bool PickFileToCompact(); + + // For L0->L0, picks the longest span of files that aren't currently + // undergoing compaction for which work-per-deleted-file decreases. The span + // always starts from the newest L0 file. + // + // Intra-L0 compaction is independent of all other files, so it can be + // performed even when L0->base_level compactions are blocked. + // + // Returns true if `inputs` is populated with a span of files to be compacted; + // otherwise, returns false. 
+ bool PickIntraL0Compaction(); + + void PickExpiredTtlFiles(); + + void PickFilesMarkedForPeriodicCompaction(); + + const std::string& cf_name_; + VersionStorageInfo* vstorage_; + SequenceNumber earliest_mem_seqno_; + CompactionPicker* compaction_picker_; + LogBuffer* log_buffer_; + int start_level_ = -1; + int output_level_ = -1; + int parent_index_ = -1; + int base_index_ = -1; + double start_level_score_ = 0; + bool is_manual_ = false; + CompactionInputFiles start_level_inputs_; + std::vector compaction_inputs_; + CompactionInputFiles output_level_inputs_; + std::vector grandparents_; + CompactionReason compaction_reason_ = CompactionReason::kUnknown; + + const MutableCFOptions& mutable_cf_options_; + const ImmutableCFOptions& ioptions_; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); + + static const int kMinFilesForIntraL0Compaction = 4; +}; + +void LevelCompactionBuilder::PickExpiredTtlFiles() { + if (vstorage_->ExpiredTtlFiles().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + start_level_ = level_file.first; + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + + if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->ExpiredTtlFiles()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::PickFilesMarkedForPeriodicCompaction() { + if (vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + return; + } + + auto continuation = [&](std::pair level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + output_level_ = start_level_ = level_file.first; + + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs_.files = {level_file.second}; + start_level_inputs_.level = start_level_; + return compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_); + }; + + for (auto& level_file : vstorage_->FilesMarkedForPeriodicCompaction()) { + if (continuation(level_file)) { + // found the compaction! + return; + } + } + + start_level_inputs_.files.clear(); +} + +void LevelCompactionBuilder::SetupInitialFiles() { + // Find the compactions by size on all levels. 
+ bool skipped_l0_to_base = false; + for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { + start_level_score_ = vstorage_->CompactionScore(i); + start_level_ = vstorage_->CompactionScoreLevel(i); + assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); + if (start_level_score_ >= 1) { + if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) { + // If L0->base_level compaction is pending, don't schedule further + // compaction from base level. Otherwise L0->base_level compaction + // may starve. + continue; + } + output_level_ = + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; + if (PickFileToCompact()) { + // found the compaction! + if (start_level_ == 0) { + // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + } else { + // L1+ score = `Level files size` / `MaxBytesForLevel` + compaction_reason_ = CompactionReason::kLevelMaxLevelSize; + } + break; + } else { + // didn't find the compaction, clear the inputs + start_level_inputs_.clear(); + if (start_level_ == 0) { + skipped_l0_to_base = true; + // L0->base_level may be blocked due to ongoing L0->base_level + // compactions. It may also be blocked by an ongoing compaction from + // base_level downwards. + // + // In these cases, to reduce L0 file count and thus reduce likelihood + // of write stalls, we can attempt compacting a span of files within + // L0. + if (PickIntraL0Compaction()) { + output_level_ = 0; + compaction_reason_ = CompactionReason::kLevelL0FilesNum; + break; + } + } + } + } + } + + // if we didn't find a compaction, check if there are any files marked for + // compaction + if (start_level_inputs_.empty()) { + parent_index_ = base_index_ = -1; + + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, + &start_level_inputs_); + if (!start_level_inputs_.empty()) { + is_manual_ = true; + compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + } + + // Bottommost Files Compaction on deleting tombstones + if (start_level_inputs_.empty()) { + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; + } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); + } else { + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + } + + // TTL Compaction + if (start_level_inputs_.empty()) { + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; + return; + } + } + + // Periodic Compaction + if (start_level_inputs_.empty()) { + PickFilesMarkedForPeriodicCompaction(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kPeriodicCompaction; + return; + } + } +} + +bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { + if (start_level_ == 0 && output_level_ != 0) { + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); + } + return true; +} + +bool 
LevelCompactionBuilder::SetupOtherInputsIfNeeded() { + // Setup input files from output level. For output to L0, we only compact + // spans of files that do not interact with any pending compactions, so don't + // need to consider other levels. + if (output_level_ != 0) { + output_level_inputs_.level = output_level_; + if (!compaction_picker_->SetupOtherInputs( + cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_, + &output_level_inputs_, &parent_index_, base_index_)) { + return false; + } + + compaction_inputs_.push_back(start_level_inputs_); + if (!output_level_inputs_.empty()) { + compaction_inputs_.push_back(output_level_inputs_); + } + + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlap with another running compaction, and both + // of them have the same output level. This could happen if + // (1) we are running a non-exclusive manual compaction + // (2) AddFile ingest a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction(compaction_inputs_, + output_level_)) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, we cannot run it. + return false; + } + compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, + output_level_inputs_, &grandparents_); + } else { + compaction_inputs_.push_back(start_level_inputs_); + } + return true; +} + +Compaction* LevelCompactionBuilder::PickCompaction() { + // Pick up the first file to start compaction. It may have been extended + // to a clean cut. + SetupInitialFiles(); + if (start_level_inputs_.empty()) { + return nullptr; + } + assert(start_level_ >= 0 && output_level_ >= 0); + + // If it is a L0 -> base level compaction, we need to set up other L0 + // files if needed. + if (!SetupOtherL0FilesIfNeeded()) { + return nullptr; + } + + // Pick files in the output level and expand more files in the start level + // if needed. + if (!SetupOtherInputsIfNeeded()) { + return nullptr; + } + + // Form a compaction object containing the files we picked. + Compaction* c = GetCompaction(); + + TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c); + + return c; +} + +Compaction* LevelCompactionBuilder::GetCompaction() { + auto c = new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), + output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), + mutable_cf_options_.max_compaction_bytes, + GetPathId(ioptions_, mutable_cf_options_, output_level_), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level_, vstorage_->base_level()), + GetCompressionOptions(ioptions_, vstorage_, output_level_), + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + start_level_score_, false /* deletion_compaction */, compaction_reason_); + + // If it's level 0 compaction, make sure we don't execute any other level 0 + // compactions in parallel + compaction_picker_->RegisterCompaction(c); + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). 
Since we just changed compaction score, we recalculate it + // here + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); + return c; +} + +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionBuilder::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.cf_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.cf_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.cf_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/master/db/column_family.cc. + // Still, adding this check to avoid accidentally using + // max_bytes_for_level_multiplier_additional + level_size = static_cast( + level_size * mutable_cf_options.max_bytes_for_level_multiplier); + } else { + level_size = static_cast( + level_size * mutable_cf_options.max_bytes_for_level_multiplier * + mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); + } + } + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.cf_paths[p].target_size; + } + return p; +} + +bool LevelCompactionBuilder::PickFileToCompact() { + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. + if (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty()) { + TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0"); + return false; + } + + start_level_inputs_.clear(); + + assert(start_level_ >= 0); + + // Pick the largest file in this level that is not already + // being compacted + const std::vector& file_size = + vstorage_->FilesByCompactionPri(start_level_); + const std::vector& level_files = + vstorage_->LevelFiles(start_level_); + + unsigned int cmp_idx; + for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); + cmp_idx < file_size.size(); cmp_idx++) { + int index = file_size[cmp_idx]; + auto* f = level_files[index]; + + // do not pick a file to compact if it is being compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + start_level_inputs_.files.push_back(f); + start_level_inputs_.level = start_level_; + if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_) || + compaction_picker_->FilesRangeOverlapWithCompaction( + {start_level_inputs_}, output_level_)) { + // A locked (pending compaction) input-level file was pulled in due to + // user-key overlap. + start_level_inputs_.clear(); + continue; + } + + // Now that input level is fully expanded, we check whether any output files + // are locked due to pending compaction. 
+ // + // Note we rely on ExpandInputsToCleanCut() to tell us whether any output- + // level files are locked, not just the extra ones pulled in for user-key + // overlap. + InternalKey smallest, largest; + compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); + CompactionInputFiles output_level_inputs; + output_level_inputs.level = output_level_; + vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, + &output_level_inputs.files); + if (!output_level_inputs.empty() && + !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &output_level_inputs)) { + start_level_inputs_.clear(); + continue; + } + base_index_ = index; + break; + } + + // store where to start the iteration in the next call to PickCompaction + vstorage_->SetNextCompactionIndex(start_level_, cmp_idx); + + return start_level_inputs_.size() > 0; +} + +bool LevelCompactionBuilder::PickIntraL0Compaction() { + start_level_inputs_.clear(); + const std::vector& level_files = + vstorage_->LevelFiles(0 /* level */); + if (level_files.size() < + static_cast( + mutable_cf_options_.level0_file_num_compaction_trigger + 2) || + level_files[0]->being_compacted) { + // If L0 isn't accumulating much files beyond the regular trigger, don't + // resort to L0->L0 compaction yet. + return false; + } + return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, + port::kMaxUint64, + mutable_cf_options_.max_compaction_bytes, + &start_level_inputs_, earliest_mem_seqno_); +} +} // namespace + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_mem_seqno) { + LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, + log_buffer, mutable_cf_options, ioptions_); + return builder.PickCompaction(); +} +} // namespace rocksdb diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h new file mode 100644 index 00000000000..c8d905ef90f --- /dev/null +++ b/db/compaction/compaction_picker_level.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "db/compaction/compaction_picker.h" + +namespace rocksdb { +// Picking compactions for leveled compaction. See wiki page +// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction +// for description of Leveled compaction. 
+class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; + +} // namespace rocksdb diff --git a/db/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc similarity index 83% rename from db/compaction_picker_test.cc rename to db/compaction/compaction_picker_test.cc index 31325c12893..5cb3350d648 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -3,19 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/compaction_picker.h" #include #include #include -#include "db/compaction.h" -#include "db/compaction_picker_fifo.h" -#include "db/compaction_picker_universal.h" - -#include "util/logging.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker_fifo.h" +#include "db/compaction/compaction_picker_level.h" +#include "db/compaction/compaction_picker_universal.h" + +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -57,6 +57,8 @@ class CompactionPickerTest : public testing::Test { log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), file_num_(1), vstorage_(nullptr) { + mutable_cf_options_.ttl = 0; + mutable_cf_options_.periodic_compaction_seconds = 0; // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of // tests to cover. ioptions_.compaction_pri = kByCompensatedSize; @@ -88,15 +90,14 @@ class CompactionPickerTest : public testing::Test { SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, size_t compensated_file_size = 0) { assert(level < vstorage_->num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, path_id, file_size); - f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); - f->largest = InternalKey(largest, largest_seq, kTypeValue); - f->fd.smallest_seqno = smallest_seq; - f->fd.largest_seqno = largest_seq; + FileMetaData* f = new FileMetaData( + file_number, path_id, file_size, + InternalKey(smallest, smallest_seq, kTypeValue), + InternalKey(largest, largest_seq, kTypeValue), smallest_seq, + largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; - f->refs = 0; vstorage_->AddFile(level, f); files_.emplace_back(f); file_map_.insert({file_number, {f, level}}); @@ -501,6 +502,168 @@ TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) { ASSERT_TRUE(compaction->is_trivial_move()); } +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) { + // The case where universal periodic compaction can be picked + // with some newer files being compacted. 
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[2].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_TRUE(compaction);
+  ASSERT_EQ(4, compaction->output_level());
+  ASSERT_EQ(0, compaction->start_level());
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+  // The case where universal periodic compaction does not pick a compaction
+  // if the levels it would compact don't cover any file marked for
+  // periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+  // The case where universal periodic compaction does not pick up only the
+  // last sorted run (a single L0 file) when that file isn't marked for
+  // periodic compaction.
+  const uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.periodic_compaction_seconds = 1000;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(5, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+  file_map_[5].first->being_compacted = true;
+  UpdateVersionStorageInfo();
+  vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+  // The case where universal periodic compaction couldn't form a compaction
+  // that includes any file marked for periodic compaction. Right now we form
+  // the compaction anyway if it covers more than one sorted run. The case is
+  // here to validate that it doesn't
+  // crash.
+ const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); + Add(2, 2U, "010", "080", kFileSize, 0, 200, 251); + Add(3, 5U, "010", "080", kFileSize, 0, 200, 251); + Add(4, 3U, "301", "350", kFileSize, 0, 101, 150); + Add(4, 6U, "501", "750", kFileSize, 0, 101, 150); + + file_map_[2].first->being_compacted = true; + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(!compaction || + compaction->start_level() != compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) { + // Test single L0 file periodic compaction triggering. + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(0, 6U, "150", "200", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(0, compaction->start_level()); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) { + // Test single sorted run non-L0 periodic compaction + const uint64_t kFileSize = 100000; + + mutable_cf_options_.periodic_compaction_seconds = 1000; + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + + NewVersionStorage(5, kCompactionStyleUniversal); + + Add(4, 5U, "150", "200", kFileSize, 0, 500, 550); + Add(4, 6U, "350", "400", kFileSize, 0, 500, 550); + UpdateVersionStorageInfo(); + vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first); + + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction); + ASSERT_EQ(4, compaction->start_level()); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(4, compaction->output_level()); +} + TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { NewVersionStorage(1, kCompactionStyleFIFO); const int kFileCount = @@ -1478,6 +1641,97 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 1000000u; + NewVersionStorage(6, kCompactionStyleLevel); + + // All 5 L0 files will be picked for intra L0 compaction. The one L1 file + // spans entire L0 key range and is marked as being compacted to avoid + // L0->L1 compaction. 
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 110, 111); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(5U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 5 L0 files will be picked for intra L0 compaction due to + // max_compaction_bytes limit (the minimum number of files for triggering + // intra L0 compaction is 4). The one L1 file spans entire L0 key range and + // is marked as being compacted to avoid L0->L1 compaction. + Add(0, 1U, "100", "150", 200000U, 0, 100, 101); + Add(0, 2U, "151", "200", 200000U, 0, 102, 103); + Add(0, 3U, "201", "250", 200000U, 0, 104, 105); + Add(0, 4U, "251", "300", 200000U, 0, 106, 107); + Add(0, 5U, "301", "350", 200000U, 0, 108, 109); + Add(1, 6U, "100", "350", 200000U, 0, 109, 110); + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + +TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { + // Intra L0 compaction triggers only if there are at least + // level0_file_num_compaction_trigger + 2 L0 files. + mutable_cf_options_.level0_file_num_compaction_trigger = 3; + mutable_cf_options_.max_compaction_bytes = 999999u; + NewVersionStorage(6, kCompactionStyleLevel); + + // 4 out of 6 L0 files will be picked for intra L0 compaction due to + // being_compact limit. And the latest one L0 will be skipped due to earliest + // seqno. The one L1 file spans entire L0 key range and is marked as being + // compacted to avoid L0->L1 compaction. 
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111); + Add(0, 2U, "301", "350", 1U, 0, 108, 109); + Add(0, 3U, "251", "300", 1U, 0, 106, 107); + Add(0, 4U, "201", "250", 1U, 0, 104, 105); + Add(0, 5U, "151", "200", 1U, 0, 102, 103); + Add(0, 6U, "100", "150", 1U, 0, 100, 101); + Add(0, 7U, "100", "100", 1U, 0, 99, 100); + vstorage_->LevelFiles(0)[5]->being_compacted = true; + vstorage_->LevelFiles(1)[0]->being_compacted = true; + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_, 107)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_levels()); + ASSERT_EQ(4U, compaction->num_input_files(0)); + ASSERT_EQ(CompactionReason::kLevelL0FilesNum, + compaction->compaction_reason()); + ASSERT_EQ(0, compaction->output_level()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc similarity index 57% rename from db/compaction_picker_universal.cc rename to db/compaction/compaction_picker_universal.cc index 9291178585a..473a480cbc2 100644 --- a/db/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -7,28 +7,129 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/compaction_picker_universal.h" +#include "db/compaction/compaction_picker_universal.h" #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include #include #include "db/column_family.h" +#include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/statistics.h" -#include "util/filename.h" -#include "util/log_buffer.h" +#include "test_util/sync_point.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { namespace { +// A helper class that form universal compactions. The class is used by +// UniversalCompactionPicker::PickCompaction(). +// The usage is to create the class, and get the compaction object by calling +// PickCompaction(). +class UniversalCompactionBuilder { + public: + UniversalCompactionBuilder(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp, + const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + UniversalCompactionPicker* picker, + LogBuffer* log_buffer) + : ioptions_(ioptions), + icmp_(icmp), + cf_name_(cf_name), + mutable_cf_options_(mutable_cf_options), + vstorage_(vstorage), + picker_(picker), + log_buffer_(log_buffer) {} + + // Form and return the compaction object. The caller owns return object. + Compaction* PickCompaction(); + + private: + struct SortedRun { + SortedRun(int _level, FileMetaData* _file, uint64_t _size, + uint64_t _compensated_file_size, bool _being_compacted) + : level(_level), + file(_file), + size(_size), + compensated_file_size(_compensated_file_size), + being_compacted(_being_compacted) { + assert(compensated_file_size > 0); + assert(level != 0 || file != nullptr); + } + + void Dump(char* out_buf, size_t out_buf_size, + bool print_path = false) const; + + // sorted_run_count is added into the string to print + void DumpSizeInfo(char* out_buf, size_t out_buf_size, + size_t sorted_run_count) const; + + int level; + // `file` Will be null for level > 0. 
For level = 0, the sorted run is + // for this file. + FileMetaData* file; + // For level > 0, `size` and `compensated_file_size` are sum of sizes all + // files in the level. `being_compacted` should be the same for all files + // in a non-zero level. Use the value here. + uint64_t size; + uint64_t compensated_file_size; + bool being_compacted; + }; + + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionToReduceSizeAmp(); + + Compaction* PickDeleteTriggeredCompaction(); + + // Form a compaction from the sorted run indicated by start_index to the + // oldest sorted run. + // The caller is responsible for making sure that those files are not in + // compaction. + Compaction* PickCompactionToOldest(size_t start_index, + CompactionReason compaction_reason); + + // Try to pick periodic compaction. The caller should only call it + // if there is at least one file marked for periodic compaction. + // null will be returned if no such a compaction can be formed + // because some files are being compacted. + Compaction* PickPeriodicCompaction(); + + // Used in universal compaction when the enabled_trivial_move + // option is set. Checks whether there are any overlapping files + // in the input. Returns true if the input files are non + // overlapping. + bool IsInputFilesNonOverlapping(Compaction* c); + + const ImmutableCFOptions& ioptions_; + const InternalKeyComparator* icmp_; + double score_; + std::vector sorted_runs_; + const std::string& cf_name_; + const MutableCFOptions& mutable_cf_options_; + VersionStorageInfo* vstorage_; + UniversalCompactionPicker* picker_; + LogBuffer* log_buffer_; + + static std::vector CalculateSortedRuns( + const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + + // Pick a path ID to place a newly generated file, with its estimated file + // size. + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + uint64_t file_size); +}; + // Used in universal compaction when trivial move is enabled. 
// This structure is used for the construction of min heap // that contains the file meta data, the level of the file @@ -117,7 +218,7 @@ void GetSmallestLargestSeqno(const std::vector& files, // Algorithm that checks to see if there are any overlapping // files in the input -bool UniversalCompactionPicker::IsInputFilesNonOverlapping(Compaction* c) { +bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) { auto comparator = icmp_->user_comparator(); int first_iter = 1; @@ -165,15 +266,28 @@ bool UniversalCompactionPicker::NeedsCompaction( if (vstorage->CompactionScore(kLevel0) >= 1) { return true; } + if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) { + return true; + } if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } return false; } -void UniversalCompactionPicker::SortedRun::Dump(char* out_buf, - size_t out_buf_size, - bool print_path) const { +Compaction* UniversalCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber /* earliest_memtable_seqno */) { + UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, + mutable_cf_options, vstorage, this, + log_buffer); + return builder.PickCompaction(); +} + +void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf, + size_t out_buf_size, + bool print_path) const { if (level == 0) { assert(file != nullptr); if (file->fd.GetPathId() == 0 || !print_path) { @@ -189,7 +303,7 @@ void UniversalCompactionPicker::SortedRun::Dump(char* out_buf, } } -void UniversalCompactionPicker::SortedRun::DumpSizeInfo( +void UniversalCompactionBuilder::SortedRun::DumpSizeInfo( char* out_buf, size_t out_buf_size, size_t sorted_run_count) const { if (level == 0) { assert(file != nullptr); @@ -208,11 +322,11 @@ void UniversalCompactionPicker::SortedRun::DumpSizeInfo( } } -std::vector -UniversalCompactionPicker::CalculateSortedRuns( +std::vector +UniversalCompactionBuilder::CalculateSortedRuns( const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/, const MutableCFOptions& mutable_cf_options) { - std::vector ret; + std::vector ret; for (FileMetaData* f : vstorage.LevelFiles(0)) { ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, f->being_compacted); @@ -235,8 +349,8 @@ UniversalCompactionPicker::CalculateSortedRuns( // non-zero level, all the files should share the same being_compacted // value. // This assumption is only valid when - // mutable_cf_options.compaction_options_universal.allow_trivial_move is - // false + // mutable_cf_options.compaction_options_universal.allow_trivial_move + // is false assert(is_first || f->being_compacted == being_compacted); } if (is_first) { @@ -254,65 +368,68 @@ UniversalCompactionPicker::CalculateSortedRuns( // Universal style of compaction. Pick files that are contiguous in // time-range to compact. 
-Compaction* UniversalCompactionPicker::PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { +Compaction* UniversalCompactionBuilder::PickCompaction() { const int kLevel0 = 0; - double score = vstorage->CompactionScore(kLevel0); - std::vector sorted_runs = - CalculateSortedRuns(*vstorage, ioptions_, mutable_cf_options); - - if (sorted_runs.size() == 0 || - (vstorage->FilesMarkedForCompaction().empty() && - sorted_runs.size() < (unsigned int)mutable_cf_options - .level0_file_num_compaction_trigger)) { - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: nothing to do\n", - cf_name.c_str()); - TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", - nullptr); + score_ = vstorage_->CompactionScore(kLevel0); + sorted_runs_ = + CalculateSortedRuns(*vstorage_, ioptions_, mutable_cf_options_); + + if (sorted_runs_.size() == 0 || + (vstorage_->FilesMarkedForPeriodicCompaction().empty() && + vstorage_->FilesMarkedForCompaction().empty() && + sorted_runs_.size() < (unsigned int)mutable_cf_options_ + .level0_file_num_compaction_trigger)) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n", + cf_name_.c_str()); + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); return nullptr; } VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER_MAX_SZ( - log_buffer, 3072, + log_buffer_, 3072, "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n", - cf_name.c_str(), sorted_runs.size(), vstorage->LevelSummary(&tmp)); + cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp)); - // Check for size amplification first. Compaction* c = nullptr; - if (sorted_runs.size() >= - static_cast( - mutable_cf_options.level0_file_num_compaction_trigger)) { - if ((c = PickCompactionToReduceSizeAmp(cf_name, mutable_cf_options, - vstorage, score, sorted_runs, - log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: compacting for size amp\n", - cf_name.c_str()); + // Periodic compaction has higher priority than other type of compaction + // because it's a hard requirement. + if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) { + // Always need to do a full compaction for periodic compaction. + c = PickPeriodicCompaction(); + } + + // Check for size amplification. + if (c == nullptr && + sorted_runs_.size() >= + static_cast( + mutable_cf_options_.level0_file_num_compaction_trigger)) { + if ((c = PickCompactionToReduceSizeAmp()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n", + cf_name_.c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. unsigned int ratio = - mutable_cf_options.compaction_options_universal.size_ratio; + mutable_cf_options_.compaction_options_universal.size_ratio; - if ((c = PickCompactionToReduceSortedRuns( - cf_name, mutable_cf_options, vstorage, score, ratio, UINT_MAX, - sorted_runs, log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, + if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size ratio\n", - cf_name.c_str()); + cf_name_.c_str()); } else { // Size amplification and file size ratios are within configured limits. 
// If max read amplification is exceeding configured limits, then force // compaction without looking at filesize ratios and try to reduce // the number of files to fewer than level0_file_num_compaction_trigger. // This is guaranteed by NeedsCompaction() - assert(sorted_runs.size() >= + assert(sorted_runs_.size() >= static_cast( - mutable_cf_options.level0_file_num_compaction_trigger)); + mutable_cf_options_.level0_file_num_compaction_trigger)); // Get the total number of sorted runs that are not being compacted int num_sr_not_compacted = 0; - for (size_t i = 0; i < sorted_runs.size(); i++) { - if (sorted_runs[i].being_compacted == false) { + for (size_t i = 0; i < sorted_runs_.size(); i++) { + if (sorted_runs_[i].being_compacted == false) { num_sr_not_compacted++; } } @@ -320,16 +437,15 @@ Compaction* UniversalCompactionPicker::PickCompaction( // The number of sorted runs that are not being compacted is greater // than the maximum allowed number of sorted runs if (num_sr_not_compacted > - mutable_cf_options.level0_file_num_compaction_trigger) { + mutable_cf_options_.level0_file_num_compaction_trigger) { unsigned int num_files = num_sr_not_compacted - - mutable_cf_options.level0_file_num_compaction_trigger + 1; - if ((c = PickCompactionToReduceSortedRuns( - cf_name, mutable_cf_options, vstorage, score, UINT_MAX, - num_files, sorted_runs, log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, + mutable_cf_options_.level0_file_num_compaction_trigger + 1; + if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) != + nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for file num -- %u\n", - cf_name.c_str(), num_files); + cf_name_.c_str(), num_files); } } } @@ -337,23 +453,22 @@ Compaction* UniversalCompactionPicker::PickCompaction( } if (c == nullptr) { - if ((c = PickDeleteTriggeredCompaction(cf_name, mutable_cf_options, - vstorage, score, sorted_runs, - log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, + if ((c = PickDeleteTriggeredCompaction()) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: delete triggered compaction\n", - cf_name.c_str()); + cf_name_.c_str()); } } if (c == nullptr) { - TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", - nullptr); + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionBuilder::PickCompaction:Return", nullptr); return nullptr; } - if (mutable_cf_options.compaction_options_universal.allow_trivial_move == - true) { + if (mutable_cf_options_.compaction_options_universal.allow_trivial_move == + true && + c->compaction_reason() != CompactionReason::kPeriodicCompaction) { c->set_is_trivial_move(IsInputFilesNonOverlapping(c)); } @@ -398,15 +513,15 @@ Compaction* UniversalCompactionPicker::PickCompaction( RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, c->inputs(0)->size()); - RegisterCompaction(c); - vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); + picker_->RegisterCompaction(c); + vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_); - TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", + TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return", c); return c; } -uint32_t UniversalCompactionPicker::GetPathId( +uint32_t UniversalCompactionBuilder::GetPathId( const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, uint64_t file_size) { // Two conditions need to be satisfied: @@ -444,15 +559,12 @@ uint32_t UniversalCompactionPicker::GetPathId( // Consider compaction 
files based on their size differences with // the next file in time order. // -Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, unsigned int ratio, - unsigned int max_number_of_files_to_compact, - const std::vector& sorted_runs, LogBuffer* log_buffer) { +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( + unsigned int ratio, unsigned int max_number_of_files_to_compact) { unsigned int min_merge_width = - mutable_cf_options.compaction_options_universal.min_merge_width; + mutable_cf_options_.compaction_options_universal.min_merge_width; unsigned int max_merge_width = - mutable_cf_options.compaction_options_universal.max_merge_width; + mutable_cf_options_.compaction_options_universal.max_merge_width; const SortedRun* sr = nullptr; bool done = false; @@ -466,16 +578,16 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( // Caller checks the size before executing this function. This invariant is // important because otherwise we may have a possible integer underflow when // dealing with unsigned types. - assert(sorted_runs.size() > 0); + assert(sorted_runs_.size() > 0); // Considers a candidate file only if it is smaller than the // total size accumulated so far. - for (size_t loop = 0; loop < sorted_runs.size(); loop++) { + for (size_t loop = 0; loop < sorted_runs_.size(); loop++) { candidate_count = 0; // Skip files that are already being compacted - for (sr = nullptr; loop < sorted_runs.size(); loop++) { - sr = &sorted_runs[loop]; + for (sr = nullptr; loop < sorted_runs_.size(); loop++) { + sr = &sorted_runs_[loop]; if (!sr->being_compacted) { candidate_count = 1; @@ -483,10 +595,10 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( } char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf)); - ROCKS_LOG_BUFFER(log_buffer, + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s" "[%d] being compacted, skipping", - cf_name.c_str(), file_num_buf, loop); + cf_name_.c_str(), file_num_buf, loop); sr = nullptr; } @@ -497,15 +609,16 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( if (sr != nullptr) { char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: Possible candidate %s[%d].", - cf_name.c_str(), file_num_buf, loop); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Possible candidate %s[%d].", + cf_name_.c_str(), file_num_buf, loop); } // Check if the succeeding files need compaction. for (size_t i = loop + 1; - candidate_count < max_files_to_compact && i < sorted_runs.size(); + candidate_count < max_files_to_compact && i < sorted_runs_.size(); i++) { - const SortedRun* succeeding_sr = &sorted_runs[i]; + const SortedRun* succeeding_sr = &sorted_runs_[i]; if (succeeding_sr->being_compacted) { break; } @@ -519,7 +632,7 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( if (sz < static_cast(succeeding_sr->size)) { break; } - if (mutable_cf_options.compaction_options_universal.stop_style == + if (mutable_cf_options_.compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. 
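(Editor's note: the hunks above only re-home PickCompactionToReduceSortedRuns onto the builder's member fields; the size-ratio grouping rule itself is unchanged. The standalone sketch below is not part of the patch — the helper name CountRunsToCompact and the run sizes are made up — but it illustrates the rule the surrounding comments describe: starting from a candidate sorted run, keep absorbing the next run while the accumulated size, inflated by size_ratio percent, still covers it. The real loop additionally honors min/max_merge_width, the similar-size stop style, and skips runs that are being compacted.)

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Starting at run `start` (runs ordered newest to oldest), keep absorbing the
// next run while the accumulated candidate size, inflated by `ratio` percent,
// is still at least the next run's size. Returns how many runs get grouped.
std::size_t CountRunsToCompact(const std::vector<std::uint64_t>& run_sizes,
                               std::size_t start, unsigned int ratio) {
  std::uint64_t candidate_size = run_sizes[start];
  std::size_t count = 1;
  for (std::size_t i = start + 1; i < run_sizes.size(); i++) {
    std::uint64_t sz = (candidate_size * (100 + ratio)) / 100;
    if (sz < run_sizes[i]) {
      break;  // the next run is too large relative to what we accumulated
    }
    candidate_size += run_sizes[i];
    count++;
  }
  return count;
}

int main() {
  // Newest run first: 1, 1, 1, 8 (arbitrary units). With ratio = 1 the three
  // small runs are grouped; the 8-unit run is left out since 3 * 1.01 < 8.
  std::vector<std::uint64_t> runs = {1, 1, 1, 8};
  std::cout << CountRunsToCompact(runs, 0, /*ratio=*/1) << "\n";  // prints 3
  return 0;
}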
@@ -545,12 +658,12 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( break; } else { for (size_t i = loop; - i < loop + candidate_count && i < sorted_runs.size(); i++) { - const SortedRun* skipping_sr = &sorted_runs[i]; + i < loop + candidate_count && i < sorted_runs_.size(); i++) { + const SortedRun* skipping_sr = &sorted_runs_[i]; char file_num_buf[256]; skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: Skipping %s", - cf_name.c_str(), file_num_buf); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s", + cf_name_.c_str(), file_num_buf); } } } @@ -562,16 +675,16 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( // size ratio of compression. bool enable_compression = true; int ratio_to_compress = - mutable_cf_options.compaction_options_universal.compression_size_percent; + mutable_cf_options_.compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { uint64_t total_size = 0; - for (auto& sorted_run : sorted_runs) { + for (auto& sorted_run : sorted_runs_) { total_size += sorted_run.compensated_file_size; } uint64_t older_file_size = 0; - for (size_t i = sorted_runs.size() - 1; i >= first_index_after; i--) { - older_file_size += sorted_runs[i].size; + for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) { + older_file_size += sorted_runs_[i].size; if (older_file_size * 100L >= total_size * (long)ratio_to_compress) { enable_compression = false; break; @@ -581,46 +694,46 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( uint64_t estimated_total_size = 0; for (unsigned int i = 0; i < first_index_after; i++) { - estimated_total_size += sorted_runs[i].size; + estimated_total_size += sorted_runs_[i].size; } uint32_t path_id = - GetPathId(ioptions_, mutable_cf_options, estimated_total_size); - int start_level = sorted_runs[start_index].level; + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; int output_level; - if (first_index_after == sorted_runs.size()) { - output_level = vstorage->num_levels() - 1; - } else if (sorted_runs[first_index_after].level == 0) { + if (first_index_after == sorted_runs_.size()) { + output_level = vstorage_->num_levels() - 1; + } else if (sorted_runs_[first_index_after].level == 0) { output_level = 0; } else { - output_level = sorted_runs[first_index_after].level - 1; + output_level = sorted_runs_[first_index_after].level - 1; } // last level is reserved for the files ingested behind if (ioptions_.allow_ingest_behind && - (output_level == vstorage->num_levels() - 1)) { + (output_level == vstorage_->num_levels() - 1)) { assert(output_level > 1); output_level--; } - std::vector inputs(vstorage->num_levels()); + std::vector inputs(vstorage_->num_levels()); for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].level = start_level + static_cast(i); } for (size_t i = start_index; i < first_index_after; i++) { - auto& picking_sr = sorted_runs[i]; + auto& picking_sr = sorted_runs_[i]; if (picking_sr.level == 0) { FileMetaData* picking_file = picking_sr.file; inputs[0].files.push_back(picking_file); } else { auto& files = inputs[picking_sr.level - start_level].files; - for (auto* f : vstorage->LevelFiles(picking_sr.level)) { + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { files.push_back(f); } } char file_num_buf[256]; picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i); - 
ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: Picking %s", cf_name.c_str(), - file_num_buf); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s", + cf_name_.c_str(), file_num_buf); } CompactionReason compaction_reason; @@ -630,16 +743,17 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( compaction_reason = CompactionReason::kUniversalSortedRunNum; } return new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, - MaxFileSizeForLevel(mutable_cf_options, output_level, + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage, mutable_cf_options, start_level, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, 1, enable_compression), - GetCompressionOptions(ioptions_, vstorage, start_level, + GetCompressionOptions(ioptions_, vstorage_, start_level, enable_compression), /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, - score, false /* deletion_compaction */, compaction_reason); + score_, false /* deletion_compaction */, compaction_reason); } // Look at overall size amplification. If size amplification @@ -648,12 +762,9 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). // -Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& sorted_runs, LogBuffer* log_buffer) { +Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { // percentage flexibility while reducing size amplification - uint64_t ratio = mutable_cf_options.compaction_options_universal + uint64_t ratio = mutable_cf_options_.compaction_options_universal .max_size_amplification_percent; unsigned int candidate_count = 0; @@ -661,21 +772,23 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( size_t start_index = 0; const SortedRun* sr = nullptr; - if (sorted_runs.back().being_compacted) { + assert(!sorted_runs_.empty()); + if (sorted_runs_.back().being_compacted) { return nullptr; } // Skip files that are already being compacted - for (size_t loop = 0; loop < sorted_runs.size() - 1; loop++) { - sr = &sorted_runs[loop]; + for (size_t loop = 0; loop < sorted_runs_.size() - 1; loop++) { + sr = &sorted_runs_[loop]; if (!sr->being_compacted) { start_index = loop; // Consider this as the first candidate. 
break; } char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: skipping %s[%d] compacted %s", - cf_name.c_str(), file_num_buf, loop, + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: skipping %s[%d] compacted %s", + cf_name_.c_str(), file_num_buf, loop, " cannot be a candidate to reduce size amp.\n"); sr = nullptr; } @@ -687,20 +800,20 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER( - log_buffer, + log_buffer_, "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s", - cf_name.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); + cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); } // keep adding up all the remaining files - for (size_t loop = start_index; loop < sorted_runs.size() - 1; loop++) { - sr = &sorted_runs[loop]; + for (size_t loop = start_index; loop < sorted_runs_.size() - 1; loop++) { + sr = &sorted_runs_[loop]; if (sr->being_compacted) { char file_num_buf[kFormatFileNumberBufSize]; sr->Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER( - log_buffer, "[%s] Universal: Possible candidate %s[%d] %s", - cf_name.c_str(), file_num_buf, start_index, + log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s", + cf_name_.c_str(), file_num_buf, start_index, " is already being compacted. No size amp reduction possible.\n"); return nullptr; } @@ -712,88 +825,35 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( } // size of earliest file - uint64_t earliest_file_size = sorted_runs.back().size; + uint64_t earliest_file_size = sorted_runs_.back().size; // size amplification = percentage of additional size if (candidate_size * 100 < ratio * earliest_file_size) { ROCKS_LOG_BUFFER( - log_buffer, + log_buffer_, "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 " earliest-file-size %" PRIu64, - cf_name.c_str(), candidate_size, earliest_file_size); + cf_name_.c_str(), candidate_size, earliest_file_size); return nullptr; } else { ROCKS_LOG_BUFFER( - log_buffer, + log_buffer_, "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 " earliest-file-size %" PRIu64, - cf_name.c_str(), candidate_size, earliest_file_size); - } - assert(start_index < sorted_runs.size() - 1); - - // Estimate total file size - uint64_t estimated_total_size = 0; - for (size_t loop = start_index; loop < sorted_runs.size(); loop++) { - estimated_total_size += sorted_runs[loop].size; - } - uint32_t path_id = - GetPathId(ioptions_, mutable_cf_options, estimated_total_size); - int start_level = sorted_runs[start_index].level; - - std::vector inputs(vstorage->num_levels()); - for (size_t i = 0; i < inputs.size(); ++i) { - inputs[i].level = start_level + static_cast(i); + cf_name_.c_str(), candidate_size, earliest_file_size); } - // We always compact all the files, so always compress. 
- for (size_t loop = start_index; loop < sorted_runs.size(); loop++) { - auto& picking_sr = sorted_runs[loop]; - if (picking_sr.level == 0) { - FileMetaData* f = picking_sr.file; - inputs[0].files.push_back(f); - } else { - auto& files = inputs[picking_sr.level - start_level].files; - for (auto* f : vstorage->LevelFiles(picking_sr.level)) { - files.push_back(f); - } - } - char file_num_buf[256]; - picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: size amp picking %s", - cf_name.c_str(), file_num_buf); - } - - // output files at the bottom most level, unless it's reserved - int output_level = vstorage->num_levels() - 1; - // last level is reserved for the files ingested behind - if (ioptions_.allow_ingest_behind) { - assert(output_level > 1); - output_level--; - } - - return new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, - MaxFileSizeForLevel(mutable_cf_options, output_level, - kCompactionStyleUniversal), - /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, - 1), - GetCompressionOptions(ioptions_, vstorage, output_level), - /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, - score, false /* deletion_compaction */, - CompactionReason::kUniversalSizeAmplification); + return PickCompactionToOldest(start_index, + CompactionReason::kUniversalSizeAmplification); } // Pick files marked for compaction. Typically, files are marked by // CompactOnDeleteCollector due to the presence of tombstones. -Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& /*sorted_runs*/, LogBuffer* /*log_buffer*/) { +Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { CompactionInputFiles start_level_inputs; int output_level; std::vector inputs; - if (vstorage->num_levels() == 1) { + if (vstorage_->num_levels() == 1) { // This is single level universal. Since we're basically trying to reclaim // space by processing files marked for compaction due to high tombstone // density, let's do the same thing as compaction to reduce size amp which @@ -803,7 +863,7 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( start_level_inputs.level = 0; start_level_inputs.files.clear(); output_level = 0; - for (FileMetaData* f : vstorage->LevelFiles(0)) { + for (FileMetaData* f : vstorage_->LevelFiles(0)) { if (f->marked_for_compaction) { compact = true; } @@ -822,24 +882,24 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( // For multi-level universal, the strategy is to make this look more like // leveled. We pick one of the files marked for compaction and compact with // overlapping files in the adjacent level. 
- PickFilesMarkedForCompaction(cf_name, vstorage, &start_level, &output_level, - &start_level_inputs); + picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level, + &output_level, &start_level_inputs); if (start_level_inputs.empty()) { return nullptr; } // Pick the first non-empty level after the start_level - for (output_level = start_level + 1; output_level < vstorage->num_levels(); + for (output_level = start_level + 1; output_level < vstorage_->num_levels(); output_level++) { - if (vstorage->NumLevelFiles(output_level) != 0) { + if (vstorage_->NumLevelFiles(output_level) != 0) { break; } } // If all higher levels are empty, pick the highest level as output level - if (output_level == vstorage->num_levels()) { + if (output_level == vstorage_->num_levels()) { if (start_level == 0) { - output_level = vstorage->num_levels() - 1; + output_level = vstorage_->num_levels() - 1; } else { // If start level is non-zero and all higher levels are empty, this // compaction will translate into a trivial move. Since the idea is @@ -849,15 +909,15 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( } } if (ioptions_.allow_ingest_behind && - output_level == vstorage->num_levels() - 1) { + output_level == vstorage_->num_levels() - 1) { assert(output_level > 1); output_level--; } if (output_level != 0) { if (start_level == 0) { - if (!GetOverlappingL0Files(vstorage, &start_level_inputs, output_level, - nullptr)) { + if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs, + output_level, nullptr)) { return nullptr; } } @@ -866,16 +926,16 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( int parent_index = -1; output_level_inputs.level = output_level; - if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, - &start_level_inputs, &output_level_inputs, - &parent_index, -1)) { + if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_, + &start_level_inputs, &output_level_inputs, + &parent_index, -1)) { return nullptr; } inputs.push_back(start_level_inputs); if (!output_level_inputs.empty()) { inputs.push_back(output_level_inputs); } - if (FilesRangeOverlapWithCompaction(inputs, output_level)) { + if (picker_->FilesRangeOverlapWithCompaction(inputs, output_level)) { return nullptr; } } else { @@ -885,23 +945,160 @@ Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( uint64_t estimated_total_size = 0; // Use size of the output level as estimated file size - for (FileMetaData* f : vstorage->LevelFiles(output_level)) { + for (FileMetaData* f : vstorage_->LevelFiles(output_level)) { estimated_total_size += f->fd.GetFileSize(); } uint32_t path_id = - GetPathId(ioptions_, mutable_cf_options, estimated_total_size); + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); return new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, - MaxFileSizeForLevel(mutable_cf_options, output_level, + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, kCompactionStyleUniversal), /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, - 1), - GetCompressionOptions(ioptions_, vstorage, output_level), + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, + output_level, 1), + GetCompressionOptions(ioptions_, vstorage_, output_level), /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true, - 
score, false /* deletion_compaction */, + score_, false /* deletion_compaction */, CompactionReason::kFilesMarkedForCompaction); } + +Compaction* UniversalCompactionBuilder::PickCompactionToOldest( + size_t start_index, CompactionReason compaction_reason) { + assert(start_index < sorted_runs_.size()); + + // Estimate total file size + uint64_t estimated_total_size = 0; + for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + estimated_total_size += sorted_runs_[loop].size; + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); + int start_level = sorted_runs_[start_index].level; + + std::vector inputs(vstorage_->num_levels()); + for (size_t i = 0; i < inputs.size(); ++i) { + inputs[i].level = start_level + static_cast(i); + } + for (size_t loop = start_index; loop < sorted_runs_.size(); loop++) { + auto& picking_sr = sorted_runs_[loop]; + if (picking_sr.level == 0) { + FileMetaData* f = picking_sr.file; + inputs[0].files.push_back(f); + } else { + auto& files = inputs[picking_sr.level - start_level].files; + for (auto* f : vstorage_->LevelFiles(picking_sr.level)) { + files.push_back(f); + } + } + std::string comp_reason_print_string; + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + comp_reason_print_string = "periodic compaction"; + } else if (compaction_reason == + CompactionReason::kUniversalSizeAmplification) { + comp_reason_print_string = "size amp"; + } else { + assert(false); + } + + char file_num_buf[256]; + picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop); + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s", + cf_name_.c_str(), comp_reason_print_string.c_str(), + file_num_buf); + } + + // output files at the bottom most level, unless it's reserved + int output_level = vstorage_->num_levels() - 1; + // last level is reserved for the files ingested behind + if (ioptions_.allow_ingest_behind) { + assert(output_level > 1); + output_level--; + } + + // We never check size for + // compaction_options_universal.compression_size_percent, + // because we always compact all the files, so always compress. + return new Compaction( + vstorage_, ioptions_, mutable_cf_options_, std::move(inputs), + output_level, + MaxFileSizeForLevel(mutable_cf_options_, output_level, + kCompactionStyleUniversal), + LLONG_MAX, path_id, + GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, start_level, + 1, true /* enable_compression */), + GetCompressionOptions(ioptions_, vstorage_, start_level, + true /* enable_compression */), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score_, false /* deletion_compaction */, compaction_reason); +} + +Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() { + ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction", + cf_name_.c_str()); + + // In universal compaction, sorted runs contain older data are almost always + // generated earlier too. To simplify the problem, we just try to trigger + // a full compaction. We start from the oldest sorted run and include + // all sorted runs, until we hit a sorted already being compacted. + // Since usually the largest (which is usually the oldest) sorted run is + // included anyway, doing a full compaction won't increase write + // amplification much. + + // Get some information from marked files to check whether a file is + // included in the compaction. 
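For the periodic-compaction hunk that follows, the start index is found by scanning from the oldest sorted run toward newer ones for as long as runs are free. A self-contained sketch of that scan (the helper name is invented for the example and is not part of the patch):

#include <cstddef>
#include <vector>

// Sorted runs are ordered newest (index 0) to oldest (back). Walk backwards
// from the oldest run and keep extending the pick while runs are not being
// compacted. A return value equal to runs.size() means the oldest run itself
// is busy, which the picker below treats as "nothing to do".
size_t OldestFreeRunIndex(const std::vector<bool>& being_compacted) {
  size_t start_index = being_compacted.size();
  while (start_index > 0 && !being_compacted[start_index - 1]) {
    --start_index;
  }
  return start_index;
}
// e.g. {false, false, true, false, false, false} -> 3: runs 3..5, which
// include the oldest run, are handed to PickCompactionToOldest().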
+ + size_t start_index = sorted_runs_.size(); + while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) { + start_index--; + } + if (start_index == sorted_runs_.size()) { + return nullptr; + } + + // There is a rare corner case where we can't pick up all the files + // because some files are being compacted and we end up with picking files + // but none of them need periodic compaction. Unless we simply recompact + // the last sorted run (either the last level or last L0 file), we would just + // execute the compaction, in order to simplify the logic. + if (start_index == sorted_runs_.size() - 1) { + bool included_file_marked = false; + int start_level = sorted_runs_[start_index].level; + FileMetaData* start_file = sorted_runs_[start_index].file; + for (const std::pair& level_file_pair : + vstorage_->FilesMarkedForPeriodicCompaction()) { + if (start_level != 0) { + // Last sorted run is a level + if (start_level == level_file_pair.first) { + included_file_marked = true; + break; + } + } else { + // Last sorted run is a L0 file. + if (start_file == level_file_pair.second) { + included_file_marked = true; + break; + } + } + } + if (!included_file_marked) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Cannot form a compaction covering file " + "marked for periodic compaction", + cf_name_.c_str()); + return nullptr; + } + } + + Compaction* c = PickCompactionToOldest(start_index, + CompactionReason::kPeriodicCompaction); + + TEST_SYNC_POINT_CALLBACK( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", c); + + return c; +} } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h new file mode 100644 index 00000000000..150b6bd79c1 --- /dev/null +++ b/db/compaction/compaction_picker_universal.h @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction/compaction_picker.h" + +namespace rocksdb { +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer, + SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; +}; +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git a/db/compaction_picker_universal.h b/db/compaction_picker_universal.h deleted file mode 100644 index 375e5998e25..00000000000 --- a/db/compaction_picker_universal.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#ifndef ROCKSDB_LITE - -#include "db/compaction_picker.h" - -namespace rocksdb { -class UniversalCompactionPicker : public CompactionPicker { - public: - UniversalCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override; - - virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } - - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; - - private: - struct SortedRun { - SortedRun(int _level, FileMetaData* _file, uint64_t _size, - uint64_t _compensated_file_size, bool _being_compacted) - : level(_level), - file(_file), - size(_size), - compensated_file_size(_compensated_file_size), - being_compacted(_being_compacted) { - assert(compensated_file_size > 0); - assert(level != 0 || file != nullptr); - } - - void Dump(char* out_buf, size_t out_buf_size, - bool print_path = false) const; - - // sorted_run_count is added into the string to print - void DumpSizeInfo(char* out_buf, size_t out_buf_size, - size_t sorted_run_count) const; - - int level; - // `file` Will be null for level > 0. For level = 0, the sorted run is - // for this file. - FileMetaData* file; - // For level > 0, `size` and `compensated_file_size` are sum of sizes all - // files in the level. `being_compacted` should be the same for all files - // in a non-zero level. Use the value here. - uint64_t size; - uint64_t compensated_file_size; - bool being_compacted; - }; - - // Pick Universal compaction to limit read amplification - Compaction* PickCompactionToReduceSortedRuns( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, unsigned int ratio, - unsigned int num_files, const std::vector& sorted_runs, - LogBuffer* log_buffer); - - // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionToReduceSizeAmp( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& sorted_runs, LogBuffer* log_buffer); - - Compaction* PickDeleteTriggeredCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, double score, - const std::vector& sorted_runs, LogBuffer* log_buffer); - - // Used in universal compaction when the enabled_trivial_move - // option is set. Checks whether there are any overlapping files - // in the input. Returns true if the input files are non - // overlapping. - bool IsInputFilesNonOverlapping(Compaction* c); - - static std::vector CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options); - - // Pick a path ID to place a newly generated file, with its estimated file - // size. 
- static uint32_t GetPathId(const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, - uint64_t file_size); -}; -} // namespace rocksdb -#endif // !ROCKSDB_LITE diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index a7ff587949d..de55c706ab7 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -9,11 +9,11 @@ #include "memtable/stl_wrappers.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" #include "util/kv_map.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" using std::unique_ptr; diff --git a/db/convenience.cc b/db/convenience.cc index 71c237f60c0..320d5c6e117 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -8,7 +8,7 @@ #include "rocksdb/convenience.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "util/cast_util.h" namespace rocksdb { @@ -35,6 +35,12 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { + return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path); +} +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path) { std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); @@ -59,7 +65,8 @@ Status VerifySstFileChecksum(const Options& options, if (!s.ok()) { return s; } - s = table_reader->VerifyChecksum(); + s = table_reader->VerifyChecksum(read_options, + TableReaderCaller::kUserVerifyChecksum); return s; } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 1ccb1aa2b09..4add9b26a2f 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -13,23 +13,24 @@ #include #include -#include #include #include -#include "db/db_impl.h" +#include +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/log_format.h" #include "db/version_set.h" +#include "file/filename.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" -#include "util/filename.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -76,7 +77,11 @@ class CorruptionTest : public testing::Test { delete db_; db_ = nullptr; Options opt = (options ? *options : options_); - opt.env = &env_; + if (opt.env == Options().env) { + // If env is not overridden, replace it with ErrorEnv. + // Otherwise, the test already uses a non-default Env. + opt.env = &env_; + } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; table_options.block_cache = tiny_cache_; @@ -321,6 +326,59 @@ TEST_F(CorruptionTest, TableFile) { ASSERT_NOK(dbi->VerifyChecksum()); } +TEST_F(CorruptionTest, VerifyChecksumReadahead) { + Options options; + SpecialEnv senv(Env::Default()); + options.env = &senv; + // Disable block cache as we are going to check checksum for + // the same file twice and measure number of reads. 
+ BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc)); + + Reopen(&options); + + Build(10000); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + senv.count_random_reads_ = true; + senv.random_read_counter_.Reset(); + ASSERT_OK(dbi->VerifyChecksum()); + + // Make sure the counter is enabled. + ASSERT_GT(senv.random_read_counter_.Read(), 0); + + // The SST file is about 10MB. Default readahead size is 256KB. + // Give a conservative 20 reads for metadata blocks, The number + // of random reads should be within 10 MB / 256KB + 20 = 60. + ASSERT_LT(senv.random_read_counter_.Read(), 60); + + senv.random_read_bytes_counter_ = 0; + ReadOptions ro; + ro.readahead_size = size_t{32 * 1024}; + ASSERT_OK(dbi->VerifyChecksum(ro)); + // The SST file is about 10MB. We set readahead size to 32KB. + // Give 0 to 20 reads for metadata blocks, and allow real read + // to range from 24KB to 48KB. The lower bound would be: + // 10MB / 48KB + 0 = 213 + // The higher bound is + // 10MB / 24KB + 20 = 447. + ASSERT_GE(senv.random_read_counter_.Read(), 213); + ASSERT_LE(senv.random_read_counter_.Read(), 447); + + // Test readahead shouldn't break mmap mode (where it should be + // disabled). + options.allow_mmap_reads = true; + Reopen(&options); + dbi = static_cast(db_); + ASSERT_OK(dbi->VerifyChecksum(ro)); + + CloseDb(); +} + TEST_F(CorruptionTest, TableFileIndexData) { Options options; // very big, we'll trigger flushes manually diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2d4487ff454..e964377cf9c 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -5,15 +5,15 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/meta_blocks.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -285,6 +285,9 @@ TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { TEST_F(CuckooTableDBTest, AdaptiveTable) { Options options = CurrentOptions(); + // Ensure options compatible with PlainTable + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + // Write some keys using cuckoo table. 
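Two illustrative notes on the checksum-verification changes above; neither is part of the patch. First, a hypothetical caller of the ReadOptions-taking VerifySstFileChecksum overload added to db/convenience.cc (the wrapper name and the 32 KB value are invented for the example, and it assumes the matching declaration in rocksdb/convenience.h that accompanies this change):

#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"

// Verify one SST file's checksums while limiting verification readahead,
// using the new overload that accepts ReadOptions.
rocksdb::Status VerifyWithReadahead(const std::string& sst_path) {
  rocksdb::Options options;
  rocksdb::EnvOptions env_options;
  rocksdb::ReadOptions read_options;
  read_options.readahead_size = 32 * 1024;  // 32 KB reads during the scan
  return rocksdb::VerifySstFileChecksum(options, env_options, read_options,
                                        sst_path);
}

Second, the bounds asserted in VerifyChecksumReadahead are plain division over the roughly 10 MB file: 10240 KB / 256 KB = 40 data reads plus a 20-read metadata allowance gives the "< 60" bound for the default readahead, while with readahead_size = 32 KB and effective reads assumed to fall between 24 KB and 48 KB, 10240 / 48 ≈ 213 and 10240 / 24 + 20 ≈ 447 give the lower and upper bounds.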
options.table_factory.reset(NewCuckooTableFactory()); Reopen(&options); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 236a534657f..601033d6125 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -10,9 +10,12 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" -#include "util/fault_injection_test_env.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { @@ -68,6 +71,44 @@ TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); } +TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) { + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("bar", "v2")); + ASSERT_OK(Put("foo", "v3")); + Close(); + + auto options = CurrentOptions(); + options.write_dbid_to_manifest = true; + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + std::string db_id1; + db_->GetDbIdentity(db_id1); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + Iterator* iter = db_->NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + ++count; + } + ASSERT_EQ(count, 2); + delete iter; + Close(); + + // Reopen and flush memtable. + Reopen(options); + Flush(); + Close(); + // Now check keys in read only mode. + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v3", Get("foo")); + ASSERT_EQ("v2", Get("bar")); + ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); + std::string db_id2; + db_->GetDbIdentity(db_id2); + ASSERT_EQ(db_id1, db_id2); +} + TEST_F(DBBasicTest, CompactedDB) { const uint64_t kFileSize = 1 << 20; Options options = CurrentOptions(); @@ -295,7 +336,7 @@ TEST_F(DBBasicTest, FlushMultipleMemtable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = -1; + options.max_write_buffer_size_to_maintain = -1; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); @@ -325,7 +366,8 @@ TEST_F(DBBasicTest, FlushEmptyColumnFamily) { writeOpt.disableWAL = true; options.max_write_buffer_number = 2; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; + options.max_write_buffer_size_to_maintain = + static_cast(options.write_buffer_size); CreateAndReopenWithCF({"pikachu"}, options); // Compaction can still go through even if no thread can flush the @@ -421,7 +463,7 @@ TEST_F(DBBasicTest, ManifestRollOver) { } while (ChangeCompactOptions()); } -TEST_F(DBBasicTest, IdentityAcrossRestarts) { +TEST_F(DBBasicTest, IdentityAcrossRestarts1) { do { std::string id1; ASSERT_OK(db_->GetDbIdentity(id1)); @@ -433,13 +475,40 @@ TEST_F(DBBasicTest, IdentityAcrossRestarts) { // id1 should match id2 because identity was not regenerated ASSERT_EQ(id1.compare(id2), 0); + std::string idfilename = IdentityFileName(dbname_); + ASSERT_OK(env_->DeleteFile(idfilename)); + Reopen(options); + std::string id3; + ASSERT_OK(db_->GetDbIdentity(id3)); + if (options.write_dbid_to_manifest) { + ASSERT_EQ(id1.compare(id3), 0); + } else { + // id1 should NOT match id3 because identity was regenerated + ASSERT_NE(id1.compare(id3), 0); + } + } while (ChangeCompactOptions()); +} + +TEST_F(DBBasicTest, 
IdentityAcrossRestarts2) { + do { + std::string id1; + ASSERT_OK(db_->GetDbIdentity(id1)); + + Options options = CurrentOptions(); + options.write_dbid_to_manifest = true; + Reopen(options); + std::string id2; + ASSERT_OK(db_->GetDbIdentity(id2)); + // id1 should match id2 because identity was not regenerated + ASSERT_EQ(id1.compare(id2), 0); + std::string idfilename = IdentityFileName(dbname_); ASSERT_OK(env_->DeleteFile(idfilename)); Reopen(options); std::string id3; ASSERT_OK(db_->GetDbIdentity(id3)); // id1 should NOT match id3 because identity was regenerated - ASSERT_NE(id1.compare(id3), 0); + ASSERT_EQ(id1, id3); } while (ChangeCompactOptions()); } @@ -803,11 +872,12 @@ TEST_F(DBBasicTest, ChecksumTest) { ASSERT_OK(Flush()); } - // verify data with each type of checksum - for (int i = 0; i <= kxxHash64; ++i) { + // with each valid checksum type setting... + for (int i = 0; i <= max_checksum; ++i) { table_options.checksum = static_cast(i); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); + // verify every type of checksum (should be regardless of that setting) for (int j = 0; j < (max_checksum + 1) * kNumPerFile; ++j) { ASSERT_EQ(Key(j), Get(Key(j))); } @@ -845,30 +915,30 @@ TEST_F(DBBasicTest, MmapAndBufferOptions) { class TestEnv : public EnvWrapper { public: - explicit TestEnv() : EnvWrapper(Env::Default()), - close_count(0) { } - - class TestLogger : public Logger { - public: - using Logger::Logv; - TestLogger(TestEnv *env_ptr) : Logger() { env = env_ptr; } - ~TestLogger() override { - if (!closed_) { - CloseHelper(); - } - } - void Logv(const char* /*format*/, va_list /*ap*/) override{}; - - protected: - Status CloseImpl() override { return CloseHelper(); } - - private: - Status CloseHelper() { - env->CloseCountInc();; - return Status::IOError(); - } - TestEnv *env; - }; + explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {} + + class TestLogger : public Logger { + public: + using Logger::Logv; + explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + CloseHelper(); + } + } + void Logv(const char* /*format*/, va_list /*ap*/) override {} + + protected: + Status CloseImpl() override { return CloseHelper(); } + + private: + Status CloseHelper() { + env->CloseCountInc(); + ; + return Status::IOError(); + } + TestEnv* env; + }; void CloseCountInc() { close_count++; } @@ -890,7 +960,8 @@ TEST_F(DBBasicTest, DBClose) { ASSERT_OK(DestroyDB(dbname, options)); DB* db = nullptr; - TestEnv* env = new TestEnv(); + TestEnv* env = new TestEnv(env_); + std::unique_ptr local_env_guard(env); options.create_if_missing = true; options.env = env; Status s = DB::Open(options, dbname, &db); @@ -924,13 +995,11 @@ TEST_F(DBBasicTest, DBClose) { ASSERT_EQ(env->GetCloseCount(), 2); options.info_log.reset(); ASSERT_EQ(env->GetCloseCount(), 3); - - delete options.env; } TEST_F(DBBasicTest, DBCloseFlushError) { std::unique_ptr fault_injection_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetDefaultOptions(); options.create_if_missing = true; options.manual_wal_flush = true; @@ -950,15 +1019,27 @@ TEST_F(DBBasicTest, DBCloseFlushError) { Destroy(options); } -TEST_F(DBBasicTest, MultiGetMultiCF) { +class DBMultiGetTestWithParam : public DBBasicTest, + public testing::WithParamInterface {}; + +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", 
"dobrynia", "nikitich", "alyosha", "popovich"}, options); - - for (int i = 0; i < 8; ++i) { - ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", - "cf" + std::to_string(i) + "_val")); + // tuples + std::vector> cf_kv_vec; + static const int num_keys = 24; + cf_kv_vec.reserve(num_keys); + + for (int i = 0; i < num_keys; ++i) { + int cf = i / 3; + int cf_key = 1 % 3; + cf_kv_vec.emplace_back(std::make_tuple( + cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key), + "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key))); + ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]), + std::get<2>(cf_kv_vec[i]))); } int get_sv_count = 0; @@ -968,10 +1049,14 @@ TEST_F(DBBasicTest, MultiGetMultiCF) { if (++get_sv_count == 2) { // After MultiGet refs a couple of CFs, flush all CFs so MultiGet // is forced to repeat the process - for (int i = 0; i < 8; ++i) { - ASSERT_OK(Flush(i)); - ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", - "cf" + std::to_string(i) + "_val2")); + for (int i = 0; i < num_keys; ++i) { + int cf = i / 3; + int cf_key = i % 8; + if (cf_key == 0) { + ASSERT_OK(Flush(cf)); + } + ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]), + std::get<2>(cf_kv_vec[i]) + "_2")); } } if (get_sv_count == 11) { @@ -989,26 +1074,53 @@ TEST_F(DBBasicTest, MultiGetMultiCF) { std::vector keys; std::vector values; - for (int i = 0; i < 8; ++i) { - cfs.push_back(i); - keys.push_back("cf" + std::to_string(i) + "_key"); + for (int i = 0; i < num_keys; ++i) { + cfs.push_back(std::get<0>(cf_kv_vec[i])); + keys.push_back(std::get<1>(cf_kv_vec[i])); } - values = MultiGet(cfs, keys, nullptr); - ASSERT_EQ(values.size(), 8); + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_EQ(values.size(), num_keys); for (unsigned int j = 0; j < values.size(); ++j) { - ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val2"); + ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2"); } - for (int i = 0; i < 8; ++i) { + + keys.clear(); + cfs.clear(); + cfs.push_back(std::get<0>(cf_kv_vec[0])); + keys.push_back(std::get<1>(cf_kv_vec[0])); + cfs.push_back(std::get<0>(cf_kv_vec[3])); + keys.push_back(std::get<1>(cf_kv_vec[3])); + cfs.push_back(std::get<0>(cf_kv_vec[4])); + keys.push_back(std::get<1>(cf_kv_vec[4])); + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2"); + ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2"); + ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2"); + + keys.clear(); + cfs.clear(); + cfs.push_back(std::get<0>(cf_kv_vec[7])); + keys.push_back(std::get<1>(cf_kv_vec[7])); + cfs.push_back(std::get<0>(cf_kv_vec[6])); + keys.push_back(std::get<1>(cf_kv_vec[6])); + cfs.push_back(std::get<0>(cf_kv_vec[1])); + keys.push_back(std::get<1>(cf_kv_vec[1])); + values = MultiGet(cfs, keys, nullptr, GetParam()); + ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2"); + ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2"); + ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2"); + + for (int cf = 0; cf < 8; ++cf) { auto* cfd = reinterpret_cast( - reinterpret_cast(db_)->GetColumnFamilyHandle(i)) + reinterpret_cast(db_)->GetColumnFamilyHandle(cf)) ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); } } -TEST_F(DBBasicTest, MultiGetMultiCFMutex) { +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", 
"nikitich", "alyosha", "popovich"}, @@ -1054,7 +1166,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFMutex) { keys.push_back("cf" + std::to_string(i) + "_key"); } - values = MultiGet(cfs, keys, nullptr); + values = MultiGet(cfs, keys, nullptr, GetParam()); ASSERT_TRUE(last_try); ASSERT_EQ(values.size(), 8); for (unsigned int j = 0; j < values.size(); ++j) { @@ -1069,7 +1181,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFMutex) { } } -TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) { +TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, @@ -1114,7 +1226,7 @@ TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) { } const Snapshot* snapshot = db_->GetSnapshot(); - values = MultiGet(cfs, keys, snapshot); + values = MultiGet(cfs, keys, snapshot, GetParam()); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(values.size(), 8); for (unsigned int j = 0; j < values.size(); ++j) { @@ -1128,6 +1240,9 @@ TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) { } } +INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam, + testing::Bool()); + TEST_F(DBBasicTest, MultiGetBatchedSimpleUnsorted) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -1284,10 +1399,712 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { } } } + +// Test class for batched MultiGet with prefix extractor +// Param bool - If true, use partitioned filters +// If false, use full filter block +class MultiGetPrefixExtractorTest : public DBBasicTest, + public ::testing::WithParamInterface { +}; + +TEST_P(MultiGetPrefixExtractorTest, Batched) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + options.memtable_prefix_bloom_size_ratio = 10; + BlockBasedTableOptions bbto; + if (GetParam()) { + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + bbto.partition_filters = true; + } + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + bbto.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); + + // First key is not in the prefix_extractor domain + ASSERT_OK(Put("k", "v0")); + ASSERT_OK(Put("kk1", "v1")); + ASSERT_OK(Put("kk2", "v2")); + ASSERT_OK(Put("kk3", "v3")); + ASSERT_OK(Put("kk4", "v4")); + std::vector mem_keys( + {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"}); + std::vector inmem_values; + inmem_values = MultiGet(mem_keys, nullptr); + ASSERT_EQ(inmem_values[0], "v0"); + ASSERT_EQ(inmem_values[1], "v1"); + ASSERT_EQ(inmem_values[2], "v2"); + ASSERT_EQ(inmem_values[3], "v3"); + ASSERT_EQ(inmem_values[4], "v4"); + ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2); + ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 5); + ASSERT_OK(Flush()); + + std::vector keys({"k", "kk1", "kk2", "kk3", "kk4"}); + std::vector values; + get_perf_context()->Reset(); + values = MultiGet(keys, nullptr); + ASSERT_EQ(values[0], "v0"); + ASSERT_EQ(values[1], "v1"); + ASSERT_EQ(values[2], "v2"); + ASSERT_EQ(values[3], "v3"); + ASSERT_EQ(values[4], "v4"); + // Filter hits for 4 in-domain keys + ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4); +} + +INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest, + ::testing::Bool()); + +#ifndef ROCKSDB_LITE +class DBMultiGetRowCacheTest : public DBBasicTest, + public ::testing::WithParamInterface {}; + 
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) { + do { + option_config_ = kRowCache; + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + SetPerfLevel(kEnableCount); + ASSERT_OK(Put(1, "k1", "v1")); + ASSERT_OK(Put(1, "k2", "v2")); + ASSERT_OK(Put(1, "k3", "v3")); + ASSERT_OK(Put(1, "k4", "v4")); + Flush(1); + ASSERT_OK(Put(1, "k5", "v5")); + const Snapshot* snap1 = dbfull()->GetSnapshot(); + ASSERT_OK(Delete(1, "k4")); + Flush(1); + const Snapshot* snap2 = dbfull()->GetSnapshot(); + + get_perf_context()->Reset(); + + std::vector keys({"no_key", "k5", "k4", "k3", "k1"}); + std::vector values(keys.size()); + std::vector cfs(keys.size(), handles_[1]); + std::vector s(keys.size()); + + ReadOptions ro; + bool use_snapshots = GetParam(); + if (use_snapshots) { + ro.snapshot = snap2; + } + db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), + s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_TRUE(s[0].IsNotFound()); + ASSERT_OK(s[1]); + ASSERT_TRUE(s[2].IsNotFound()); + ASSERT_OK(s[3]); + ASSERT_OK(s[4]); + + // Call MultiGet() again with some intersection with the previous set of + // keys. Those should already be in the row cache. + keys.assign({"no_key", "k5", "k3", "k2"}); + for (size_t i = 0; i < keys.size(); ++i) { + values[i].Reset(); + s[i] = Status::OK(); + } + get_perf_context()->Reset(); + + if (use_snapshots) { + ro.snapshot = snap1; + } + db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(), + values.data(), s.data(), false); + + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2"); + ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); + // four kv pairs * two bytes per value + ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); + + ASSERT_TRUE(s[0].IsNotFound()); + ASSERT_OK(s[1]); + ASSERT_OK(s[2]); + ASSERT_OK(s[3]); + if (use_snapshots) { + // Only reads from the first SST file would have been cached, since + // snapshot seq no is > fd.largest_seqno + ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT)); + } else { + ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT)); + } + + SetPerfLevel(kDisable); + dbfull()->ReleaseSnapshot(snap1); + dbfull()->ReleaseSnapshot(snap2); + } while (ChangeCompactOptions()); +} + +INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest, + testing::Values(true, false)); + +TEST_F(DBBasicTest, GetAllKeyVersions) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + const size_t kNumInserts = 4; + const size_t kNumDeletes = 4; + const size_t kNumUpdates = 4; + + // Check default column family + for (size_t i = 0; i != kNumInserts; ++i) { + ASSERT_OK(Put(std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates; ++i) { + ASSERT_OK(Put(std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes; ++i) { + ASSERT_OK(Delete(std::to_string(i))); + } + std::vector key_versions; + 
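A worked count for the multiget_read_bytes assertions in the row-cache test above (illustrative only): of the first batch's five keys ("no_key", "k5", "k4", "k3", "k1"), "no_key" was never written and "k4" is deleted before the second flush, so three two-byte values ("v5", "v3", "v1") come back and perf_context reports 3 * 2 = 6 bytes. The second batch ("no_key", "k5", "k3", "k2") likewise returns three two-byte values, so the same 6-byte total is asserted after the reset.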
ASSERT_OK(rocksdb::GetAllKeyVersions(db_, Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[0], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); + + // Check non-default column family + for (size_t i = 0; i != kNumInserts - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value")); + } + for (size_t i = 0; i != kNumUpdates - 1; ++i) { + ASSERT_OK(Put(1, std::to_string(i), "value1")); + } + for (size_t i = 0; i != kNumDeletes - 1; ++i) { + ASSERT_OK(Delete(1, std::to_string(i))); + } + ASSERT_OK(rocksdb::GetAllKeyVersions(db_, handles_[1], Slice(), Slice(), + std::numeric_limits::max(), + &key_versions)); + ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size()); +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.block_size = 16 * 1024; + assert(table_options.block_size > + BlockBasedTable::kMultiGetReadStackBufSize); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + std::string value(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), value) == Status::OK()); + } + Flush(); + + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); +} + +class DBBasicTestWithParallelIO + : public DBTestBase, + public testing::WithParamInterface> { + public: + DBBasicTestWithParallelIO() : DBTestBase("/db_basic_test_with_parallel_io") { + bool compressed_cache = std::get<0>(GetParam()); + bool uncompressed_cache = std::get<1>(GetParam()); + compression_enabled_ = std::get<2>(GetParam()); + fill_cache_ = std::get<3>(GetParam()); + + if (compressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + compressed_cache_ = std::make_shared(cache); + } + if (uncompressed_cache) { + std::shared_ptr cache = NewLRUCache(1048576); + uncompressed_cache_ = std::make_shared(cache); + } + + env_->count_random_reads_ = true; + + Options options = CurrentOptions(); + Random rnd(301); + BlockBasedTableOptions table_options; + +#ifndef ROCKSDB_LITE + if (compression_enabled_) { + std::vector compression_types; + compression_types = GetSupportedCompressions(); + // Not every platform may have compression libraries available, so + // dynamically pick based on what's available + if (compression_types.size() == 0) { + compression_enabled_ = false; + } else { + options.compression = compression_types[0]; + } + } +#else + // GetSupportedCompressions() is not available in LITE build + if (!Snappy_Supported()) { + 
compression_enabled_ = false; + } +#endif //ROCKSDB_LITE + + table_options.block_cache = uncompressed_cache_; + if (table_options.block_cache == nullptr) { + table_options.no_block_cache = true; + } else { + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + } + table_options.block_cache_compressed = compressed_cache_; + table_options.flush_block_policy_factory.reset( + new MyFlushBlockPolicyFactory()); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + if (!compression_enabled_) { + options.compression = kNoCompression; + } + Reopen(options); + + std::string zero_str(128, '\0'); + for (int i = 0; i < 100; ++i) { + // Make the value compressible. A purely random string doesn't compress + // and the resultant data block will not be compressed + values_.emplace_back(RandomString(&rnd, 128) + zero_str); + assert(Put(Key(i), values_[i]) == Status::OK()); + } + Flush(); + } + + bool CheckValue(int i, const std::string& value) { + if (values_[i].compare(value) == 0) { + return true; + } + return false; + } + + int num_lookups() { return uncompressed_cache_->num_lookups(); } + int num_found() { return uncompressed_cache_->num_found(); } + int num_inserts() { return uncompressed_cache_->num_inserts(); } + + int num_lookups_compressed() { return compressed_cache_->num_lookups(); } + int num_found_compressed() { return compressed_cache_->num_found(); } + int num_inserts_compressed() { return compressed_cache_->num_inserts(); } + + bool fill_cache() { return fill_cache_; } + bool compression_enabled() { return compression_enabled_; } + bool has_compressed_cache() { return compressed_cache_ != nullptr; } + bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + private: + class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { + public: + MyFlushBlockPolicyFactory() {} + + virtual const char* Name() const override { + return "MyFlushBlockPolicyFactory"; + } + + virtual FlushBlockPolicy* NewFlushBlockPolicy( + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& data_block_builder) const override { + return new MyFlushBlockPolicy(data_block_builder); + } + }; + + class MyFlushBlockPolicy : public FlushBlockPolicy { + public: + explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder) + : num_keys_(0), data_block_builder_(data_block_builder) {} + + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { + if (data_block_builder_.empty()) { + // First key in this block + num_keys_ = 1; + return false; + } + // Flush every 10 keys + if (num_keys_ == 10) { + num_keys_ = 1; + return true; + } + num_keys_++; + return false; + } + + private: + int num_keys_; + const BlockBuilder& data_block_builder_; + }; + + class MyBlockCache : public Cache { + public: + explicit MyBlockCache(std::shared_ptr& target) + : target_(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} + + virtual const char* Name() const override { return "MyBlockCache"; } + + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { + num_inserts_++; + return target_->Insert(key, value, charge, deleter, handle, priority); + } + + virtual Handle* Lookup(const Slice& key, + Statistics* stats = nullptr) override { + num_lookups_++; + Handle* handle = target_->Lookup(key, stats); + if (handle != nullptr) { + num_found_++; + } + 
return handle; + } + + virtual bool Ref(Handle* handle) override { return target_->Ref(handle); } + + virtual bool Release(Handle* handle, bool force_erase = false) override { + return target_->Release(handle, force_erase); + } + + virtual void* Value(Handle* handle) override { + return target_->Value(handle); + } + + virtual void Erase(const Slice& key) override { target_->Erase(key); } + virtual uint64_t NewId() override { return target_->NewId(); } + + virtual void SetCapacity(size_t capacity) override { + target_->SetCapacity(capacity); + } + + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + virtual bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + virtual size_t GetCapacity() const override { + return target_->GetCapacity(); + } + + virtual size_t GetUsage() const override { return target_->GetUsage(); } + + virtual size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + virtual size_t GetPinnedUsage() const override { + return target_->GetPinnedUsage(); + } + + virtual size_t GetCharge(Handle* /*handle*/) const override { return 0; } + + virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override { + return target_->ApplyToAllCacheEntries(callback, thread_safe); + } + + virtual void EraseUnRefEntries() override { + return target_->EraseUnRefEntries(); + } + + int num_lookups() { return num_lookups_; } + + int num_found() { return num_found_; } + + int num_inserts() { return num_inserts_; } + + private: + std::shared_ptr target_; + int num_lookups_; + int num_found_; + int num_inserts_; + }; + + std::shared_ptr compressed_cache_; + std::shared_ptr uncompressed_cache_; + bool compression_enabled_; + std::vector values_; + bool fill_cache_; +}; + +TEST_P(DBBasicTestWithParallelIO, MultiGet) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + bool read_from_cache = false; + if (fill_cache()) { + if (has_uncompressed_cache()) { + read_from_cache = true; + } else if (has_compressed_cache() && compression_enabled()) { + read_from_cache = true; + } + } + + int expected_reads = random_reads + (read_from_cache ? 
0 : 2); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + + keys.resize(10); + statuses.resize(10); + std::vector key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85}; + for (size_t i = 0; i < key_ints.size(); ++i) { + key_data[i] = Key(key_ints[i]); + keys[i] = Slice(key_data[i]); + statuses[i] = Status::OK(); + values[i].Reset(); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + for (size_t i = 0; i < key_ints.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString())); + } + expected_reads += (read_from_cache ? 2 : 4); + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); +} + +INSTANTIATE_TEST_CASE_P( + ParallelIO, DBBasicTestWithParallelIO, + // Params are as follows - + // Param 0 - Compressed cache enabled + // Param 1 - Uncompressed cache enabled + // Param 2 - Data compression enabled + // Param 3 - ReadOptions::fill_cache + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Bool())); + +class DBBasicTestWithTimestampWithParam + : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestWithTimestampWithParam() + : DBTestBase("/db_basic_test_with_timestamp") {} + + protected: + class TestComparator : public Comparator { + private: + const Comparator* cmp_without_ts_; + + public: + explicit TestComparator(size_t ts_sz) + : Comparator(ts_sz), cmp_without_ts_(nullptr) { + cmp_without_ts_ = BytewiseComparator(); + } + + const char* Name() const override { return "TestComparator"; } + + void FindShortSuccessor(std::string*) const override {} + + void FindShortestSeparator(std::string*, const Slice&) const override {} + + int Compare(const Slice& a, const Slice& b) const override { + int r = CompareWithoutTimestamp(a, b); + if (r != 0 || 0 == timestamp_size()) { + return r; + } + return CompareTimestamp( + Slice(a.data() + a.size() - timestamp_size(), timestamp_size()), + Slice(b.data() + b.size() - timestamp_size(), timestamp_size())); + } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const override { + assert(a.size() >= timestamp_size()); + assert(b.size() >= timestamp_size()); + Slice k1 = StripTimestampFromUserKey(a, timestamp_size()); + Slice k2 = StripTimestampFromUserKey(b, timestamp_size()); + + return cmp_without_ts_->Compare(k1, k2); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + if (!ts1.data() && !ts2.data()) { + return 0; + } else if (ts1.data() && !ts2.data()) { + return 1; + } else if (!ts1.data() && ts2.data()) { + return -1; + } + assert(ts1.size() == ts2.size()); + uint64_t low1 = 0; + uint64_t low2 = 0; + uint64_t high1 = 0; + uint64_t high2 = 0; + auto* ptr1 = const_cast(&ts1); + auto* ptr2 = const_cast(&ts2); + if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) || + !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) { + assert(false); + } + if (high1 < high2) { + return 1; + } else if (high1 > high2) { + return -1; + } + if (low1 < low2) { + return 1; + } else if (low1 > low2) { + return -1; + } + return 0; + } + }; + + Slice EncodeTimestamp(uint64_t low, uint64_t high, std::string* ts) { + assert(nullptr != ts); + ts->clear(); + PutFixed64(ts, low); + PutFixed64(ts, high); + assert(ts->size() == sizeof(low) + sizeof(high)); + return Slice(*ts); + } +}; + +TEST_P(DBBasicTestWithTimestampWithParam, PutAndGet) { + const int kNumKeysPerFile = 8192; + const size_t kNumTimestamps = 6; + bool 
memtable_only = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.env = env_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + std::string tmp; + size_t ts_sz = EncodeTimestamp(0, 0, &tmp).size(); + TestComparator test_cmp(ts_sz); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy( + 10 /*bits_per_key*/, false /*use_block_based_builder*/)); + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + std::vector write_ts_strs(kNumTimestamps); + std::vector read_ts_strs(kNumTimestamps); + std::vector write_ts_list; + std::vector read_ts_list; + + for (size_t i = 0; i != kNumTimestamps; ++i) { + write_ts_list.emplace_back(EncodeTimestamp(i * 2, 0, &write_ts_strs[i])); + read_ts_list.emplace_back(EncodeTimestamp(1 + i * 2, 0, &read_ts_strs[i])); + const Slice& write_ts = write_ts_list.back(); + WriteOptions wopts; + wopts.timestamp = &write_ts; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + ASSERT_OK(Put(cf, "key" + std::to_string(j), + "value_" + std::to_string(j) + "_" + std::to_string(i), + wopts)); + } + if (!memtable_only) { + ASSERT_OK(Flush(cf)); + } + } + } + const auto& verify_db_func = [&]() { + for (size_t i = 0; i != kNumTimestamps; ++i) { + ReadOptions ropts; + ropts.timestamp = &read_ts_list[i]; + for (int cf = 0; cf != static_cast(num_cfs); ++cf) { + ColumnFamilyHandle* cfh = handles_[cf]; + for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) { + std::string value; + ASSERT_OK(db_->Get(ropts, cfh, "key" + std::to_string(j), &value)); + ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), + value); + } + } + } + }; + verify_db_func(); +} + +INSTANTIATE_TEST_CASE_P(Timestamp, DBBasicTestWithTimestampWithParam, + ::testing::Bool()); + } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/db_blob_index_test.cc b/db/db_blob_index_test.cc index 005a23d63b7..30e44e5bac0 100644 --- a/db/db_blob_index_test.cc +++ b/db/db_blob_index_test.cc @@ -12,6 +12,7 @@ #include #include +#include "db/arena_wrapped_db_iter.h" #include "db/column_family.h" #include "db/db_iter.h" #include "db/db_test_util.h" @@ -63,9 +64,11 @@ class DBBlobIndexTest : public DBTestBase { ReadOptions read_options; read_options.snapshot = snapshot; PinnableSlice value; - auto s = dbfull()->GetImpl(read_options, cfh(), key, &value, - nullptr /*value_found*/, nullptr /*callback*/, - is_blob_index); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = cfh(); + get_impl_options.value = &value; + get_impl_options.is_blob_index = is_blob_index; + auto s = dbfull()->GetImpl(read_options, key, get_impl_options); if (s.IsNotFound()) { return "NOT_FOUND"; } @@ -395,6 +398,29 @@ TEST_F(DBBlobIndexTest, Iterate) { verify(15, Status::kOk, get_value(16, 0), 
get_value(14, 0), create_blob_iterator, check_is_blob(false)); +#ifndef ROCKSDB_LITE + // Iterator with blob support and using seek. + ASSERT_OK(dbfull()->SetOptions( + cfh(), {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, Status::kOk, get_value(1, 0), get_value(1, 0), + create_blob_iterator, check_is_blob(true)); + verify(3, Status::kOk, get_value(3, 0), get_value(3, 0), + create_blob_iterator, check_is_blob(true)); + verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), + create_blob_iterator, check_is_blob(false)); + verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), + create_blob_iterator, check_is_blob(false)); + verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), + create_blob_iterator, check_is_blob(false)); + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + verify(13, Status::kOk, + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), + create_blob_iterator, check_is_blob(false)); + verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), + create_blob_iterator, check_is_blob(false)); +#endif // !ROCKSDB_LITE + for (auto* snapshot : snapshots) { dbfull()->ReleaseSnapshot(snapshot); } diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index f6e1aad323c..89c2dbd5d16 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -19,6 +19,9 @@ class DBBlockCacheTest : public DBTestBase { size_t hit_count_ = 0; size_t insert_count_ = 0; size_t failure_count_ = 0; + size_t compression_dict_miss_count_ = 0; + size_t compression_dict_hit_count_ = 0; + size_t compression_dict_insert_count_ = 0; size_t compressed_miss_count_ = 0; size_t compressed_hit_count_ = 0; size_t compressed_insert_count_ = 0; @@ -69,6 +72,15 @@ class DBBlockCacheTest : public DBTestBase { TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); } + void RecordCacheCountersForCompressionDict(const Options& options) { + compression_dict_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + compression_dict_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + compression_dict_insert_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + } + void CheckCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, size_t expected_inserts, size_t expected_failures) { @@ -87,6 +99,28 @@ class DBBlockCacheTest : public DBTestBase { failure_count_ = new_failure_count; } + void CheckCacheCountersForCompressionDict( + const Options& options, size_t expected_compression_dict_misses, + size_t expected_compression_dict_hits, + size_t expected_compression_dict_inserts) { + size_t new_compression_dict_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + size_t new_compression_dict_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT); + size_t new_compression_dict_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD); + ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses, + new_compression_dict_miss_count); + ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits, + new_compression_dict_hit_count); + ASSERT_EQ( + compression_dict_insert_count_ + expected_compression_dict_inserts, + new_compression_dict_insert_count); + compression_dict_miss_count_ = new_compression_dict_miss_count; + compression_dict_hit_count_ = new_compression_dict_hit_count; + 
compression_dict_insert_count_ = new_compression_dict_insert_count; + } + void CheckCompressedCacheCounters(const Options& options, size_t expected_misses, size_t expected_hits, @@ -346,8 +380,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; + LRUCacheOptions co; // 500 bytes are enough to hold the first two blocks - std::shared_ptr cache = NewLRUCache(500, 0, false); + co.capacity = 500; + co.num_shard_bits = 0; + co.strict_capacity_limit = false; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); table_options.block_cache = cache; table_options.filter_policy.reset(NewBloomFilterPolicy(20, true)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -365,8 +404,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. ASSERT_OK(Put(1, "key", "val")); @@ -377,10 +419,13 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), - index_bytes_insert); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), - filter_bytes_insert); + // The index and filter eviction statistics were broken by the refactoring + // that moved the readers out of the block cache. Disabling these until we can + // bring the stats back. + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), + // index_bytes_insert); + // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), + // filter_bytes_insert); } namespace { @@ -444,11 +489,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) { TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); if (priority == Cache::Priority::LOW) { - ASSERT_EQ(0, MockCache::high_pri_insert_count); - ASSERT_EQ(2, MockCache::low_pri_insert_count); + ASSERT_EQ(0u, MockCache::high_pri_insert_count); + ASSERT_EQ(2u, MockCache::low_pri_insert_count); } else { - ASSERT_EQ(2, MockCache::high_pri_insert_count); - ASSERT_EQ(0, MockCache::low_pri_insert_count); + ASSERT_EQ(2u, MockCache::high_pri_insert_count); + ASSERT_EQ(0u, MockCache::low_pri_insert_count); } // Access data block. @@ -462,11 +507,11 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) { // Data block should be inserted with low priority. 
if (priority == Cache::Priority::LOW) { - ASSERT_EQ(0, MockCache::high_pri_insert_count); - ASSERT_EQ(3, MockCache::low_pri_insert_count); + ASSERT_EQ(0u, MockCache::high_pri_insert_count); + ASSERT_EQ(3u, MockCache::low_pri_insert_count); } else { - ASSERT_EQ(2, MockCache::high_pri_insert_count); - ASSERT_EQ(1, MockCache::low_pri_insert_count); + ASSERT_EQ(2u, MockCache::high_pri_insert_count); + ASSERT_EQ(1u, MockCache::low_pri_insert_count); } } } @@ -665,6 +710,8 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { options.table_factory.reset(new BlockBasedTableFactory(table_options)); DestroyAndReopen(options); + RecordCacheCountersForCompressionDict(options); + for (int i = 0; i < kNumFiles; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); for (int j = 0; j < kNumEntriesPerFile; ++j) { @@ -677,27 +724,26 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); + // Compression dictionary blocks are preloaded. + CheckCacheCountersForCompressionDict( + options, kNumFiles /* expected_compression_dict_misses */, + 0 /* expected_compression_dict_hits */, + kNumFiles /* expected_compression_dict_inserts */); + // Seek to a key in a file. It should cause the SST's dictionary meta-block // to be read. RecordCacheCounters(options); - ASSERT_EQ(0, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_EQ( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + RecordCacheCountersForCompressionDict(options); ReadOptions read_options; ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1))); - // Two blocks missed/added: dictionary and data block - // One block hit: index since it's prefetched - CheckCacheCounters(options, 2 /* expected_misses */, 1 /* expected_hits */, - 2 /* expected_inserts */, 0 /* expected_failures */); - ASSERT_EQ(1, - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); - ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); - ASSERT_GT( - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), - 0); + // Two block hits: index and dictionary since they are prefetched + // One block missed/added: data block + CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */, + 1 /* expected_inserts */, 0 /* expected_failures */); + CheckCacheCountersForCompressionDict( + options, 0 /* expected_compression_dict_misses */, + 1 /* expected_compression_dict_hits */, + 0 /* expected_compression_dict_inserts */); } } diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index a2a01d6b4cf..b31c935e85c 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -10,9 +10,20 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "table/block_based/filter_policy_internal.h" namespace rocksdb { +namespace { +using BFP = BloomFilterPolicy; + +namespace BFP2 { +// Extends BFP::Mode with option to use Plain table +using PseudoMode = int; +static constexpr PseudoMode kPlainTable = -1; +} // namespace BFP2 +} // namespace + // DB tests related to bloom filter. 
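// [Editor's note: illustrative sketch, not part of this change.] The tests
// below construct BloomFilterPolicy directly via the internal
// filter_policy_internal.h header so they can pin a specific Mode
// (kDeprecatedBlock, kLegacyBloom, kFastLocalBloom, ...). Application code
// would normally go through the public factory only; a minimal sketch using
// APIs that already appear in this file (the helper name is made up):
Options BloomEnabledOptionsSketch() {
  BlockBasedTableOptions bbto;
  // false => full (per-SST) filter; true => the deprecated block-based filter.
  bbto.filter_policy.reset(NewBloomFilterPolicy(10 /* bits_per_key */, false));
  // Newer format_version values (5 is used later in this file) let new SST
  // files pick up the newer full-filter Bloom implementation.
  bbto.format_version = 5;
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  return options;
}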
class DBBloomFilterTest : public DBTestBase { @@ -20,12 +31,12 @@ class DBBloomFilterTest : public DBTestBase { DBBloomFilterTest() : DBTestBase("/db_bloom_filter_test") {} }; -class DBBloomFilterTestWithParam - : public DBTestBase, - public testing::WithParamInterface> { +class DBBloomFilterTestWithParam : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { // public testing::WithParamInterface { protected: - bool use_block_based_filter_; + BFP::Mode bfp_impl_; bool partition_filters_; uint32_t format_version_; @@ -35,7 +46,7 @@ class DBBloomFilterTestWithParam ~DBBloomFilterTestWithParam() override {} void SetUp() override { - use_block_based_filter_ = std::get<0>(GetParam()); + bfp_impl_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); format_version_ = std::get<2>(GetParam()); } @@ -71,8 +82,7 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { ReadOptions ropts; std::string value; anon::OptionsOverride options_override; - options_override.filter_policy.reset( - NewBloomFilterPolicy(20, use_block_based_filter_)); + options_override.filter_policy.reset(new BFP(20, bfp_impl_)); options_override.partition_filters = partition_filters_; options_override.metadata_block_size = 32; Options options = CurrentOptions(options_override); @@ -432,8 +442,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { // trigger reset of table_factory BlockBasedTableOptions table_options; table_options.no_block_cache = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_filter_)); + table_options.filter_policy.reset(new BFP(10, bfp_impl_)); table_options.partition_filters = partition_filters_; if (partition_filters_) { table_options.index_type = @@ -502,24 +511,27 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { #ifndef ROCKSDB_VALGRIND_RUN INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestDefFormatVersion, - ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), - std::make_tuple(false, true, test::kDefaultFormatVersion), - std::make_tuple(false, false, - test::kDefaultFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kDeprecatedBlock, false, + test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), - std::make_tuple(false, true, test::kDefaultFormatVersion), - std::make_tuple(false, false, - test::kDefaultFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kDeprecatedBlock, false, + test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false, test::kLatestFormatVersion), - std::make_tuple(false, true, test::kLatestFormatVersion), - std::make_tuple(false, false, - test::kLatestFormatVersion))); + ::testing::Values( + std::make_tuple(BFP::kDeprecatedBlock, false, + test::kLatestFormatVersion), + std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion), + std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion))); #endif // ROCKSDB_VALGRIND_RUN TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -636,15 +648,17 @@ TEST_F(DBBloomFilterTest, BloomFilterReverseCompatibility) { } namespace { -// A wrapped 
bloom over default FilterPolicy -class WrappedBloom : public FilterPolicy { +// A wrapped bloom over block-based FilterPolicy +class TestingWrappedBlockBasedFilterPolicy : public FilterPolicy { public: - explicit WrappedBloom(int bits_per_key) - : filter_(NewBloomFilterPolicy(bits_per_key)), counter_(0) {} + explicit TestingWrappedBlockBasedFilterPolicy(int bits_per_key) + : filter_(NewBloomFilterPolicy(bits_per_key, true)), counter_(0) {} - ~WrappedBloom() override { delete filter_; } + ~TestingWrappedBlockBasedFilterPolicy() override { delete filter_; } - const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } + const char* Name() const override { + return "TestingWrappedBlockBasedFilterPolicy"; + } void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst) const override { @@ -671,12 +685,13 @@ class WrappedBloom : public FilterPolicy { }; } // namespace -TEST_F(DBBloomFilterTest, BloomFilterWrapper) { +TEST_F(DBBloomFilterTest, WrappedBlockBasedFilterPolicy) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); BlockBasedTableOptions table_options; - WrappedBloom* policy = new WrappedBloom(10); + TestingWrappedBlockBasedFilterPolicy* policy = + new TestingWrappedBlockBasedFilterPolicy(10); table_options.filter_policy.reset(policy); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -706,6 +721,166 @@ TEST_F(DBBloomFilterTest, BloomFilterWrapper) { ASSERT_EQ(2U * maxKey, policy->GetCounter()); } +namespace { +// NOTE: This class is referenced by HISTORY.md as a model for a wrapper +// FilterPolicy selecting among configurations based on context. +class LevelAndStyleCustomFilterPolicy : public FilterPolicy { + public: + explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other, + int bpk_otherwise) + : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)), + policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)), + policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {} + + // OK to use built-in policy name because we are deferring to a + // built-in builder. We aren't changing the serialized format. + const char* Name() const override { return policy_fifo_->Name(); } + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override { + if (context.compaction_style == kCompactionStyleFIFO) { + return policy_fifo_->GetBuilderWithContext(context); + } else if (context.level_at_creation == 0) { + return policy_l0_other_->GetBuilderWithContext(context); + } else { + return policy_otherwise_->GetBuilderWithContext(context); + } + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + // OK to defer to any of them; they all can parse built-in filters + // from any settings. 
+ return policy_fifo_->GetFilterBitsReader(contents); + } + + // Defer just in case configuration uses block-based filter + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + policy_otherwise_->CreateFilter(keys, n, dst); + } + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + return policy_otherwise_->KeyMayMatch(key, filter); + } + + private: + const std::unique_ptr policy_fifo_; + const std::unique_ptr policy_l0_other_; + const std::unique_ptr policy_otherwise_; +}; + +class TestingContextCustomFilterPolicy + : public LevelAndStyleCustomFilterPolicy { + public: + explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other, + int bpk_otherwise) + : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) { + } + + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext& context) const override { + test_report_ += "cf="; + test_report_ += context.column_family_name; + test_report_ += ",cs="; + test_report_ += + OptionsHelper::compaction_style_to_string[context.compaction_style]; + test_report_ += ",lv="; + test_report_ += std::to_string(context.level_at_creation); + test_report_ += "\n"; + + return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); + } + + std::string DumpTestReport() { + std::string rv; + std::swap(rv, test_report_); + return rv; + } + + private: + mutable std::string test_report_; +}; +} // namespace + +TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { + for (bool fifo : {true, false}) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + options.compaction_style = + fifo ? kCompactionStyleFIFO : kCompactionStyleLevel; + + BlockBasedTableOptions table_options; + auto policy = std::make_shared(15, 8, 5); + table_options.filter_policy = policy; + table_options.format_version = 5; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey / 2; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + EXPECT_EQ(policy->DumpTestReport(), + fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" + : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + + for (int i = maxKey / 2; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + Flush(1); + EXPECT_EQ(policy->DumpTestReport(), + fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" + : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + + // Check that they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + // Since we have two tables / two filters, we might have Bloom checks on + // our queries, but no more than one "useful" per query on a found key. + EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey); + + // Check that we have two filters, each about + // fifo: 0.12% FP rate (15 bits per key) + // level: 2.3% FP rate (8 bits per key) + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + { + auto useful_count = + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975)); + EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 
0.9995 : 0.98)); + } + + if (!fifo) { // FIFO only has L0 + // Full compaction + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); + EXPECT_EQ(policy->DumpTestReport(), + "cf=bob,cs=kCompactionStyleLevel,lv=1\n"); + + // Check that we now have one filter, about 9.2% FP rate (5 bits per key) + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); + } + { + auto useful_count = + TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + EXPECT_GE(useful_count, maxKey * 0.90); + EXPECT_LE(useful_count, maxKey * 0.91); + } + } + + // Destroy + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + dbfull()->DestroyColumnFamilyHandle(handles_[1]); + handles_[1] = nullptr; + } +} + class SliceTransformLimitedDomain : public SliceTransform { const char* Name() const override { return "SliceTransformLimitedDomain"; } @@ -858,33 +1033,32 @@ TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) { #ifndef ROCKSDB_LITE class BloomStatsTestWithParam : public DBBloomFilterTest, - public testing::WithParamInterface> { + public testing::WithParamInterface> { public: BloomStatsTestWithParam() { - use_block_table_ = std::get<0>(GetParam()); - use_block_based_builder_ = std::get<1>(GetParam()); - partition_filters_ = std::get<2>(GetParam()); + bfp_impl_ = std::get<0>(GetParam()); + partition_filters_ = std::get<1>(GetParam()); options_.create_if_missing = true; options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4)); options_.memtable_prefix_bloom_size_ratio = 8.0 * 1024.0 / static_cast(options_.write_buffer_size); - if (use_block_table_) { + if (bfp_impl_ == BFP2::kPlainTable) { + assert(!partition_filters_); // not supported in plain table + PlainTableOptions table_options; + options_.table_factory.reset(NewPlainTableFactory(table_options)); + } else { BlockBasedTableOptions table_options; table_options.hash_index_allow_collision = false; if (partition_filters_) { - assert(!use_block_based_builder_); + assert(bfp_impl_ != BFP::kDeprecatedBlock); table_options.partition_filters = partition_filters_; table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder_)); + new BFP(10, static_cast(bfp_impl_))); options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); - } else { - assert(!partition_filters_); // not supported in plain table - PlainTableOptions table_options; - options_.table_factory.reset(NewPlainTableFactory(table_options)); } options_.env = env_; @@ -901,8 +1075,7 @@ class BloomStatsTestWithParam static void SetUpTestCase() {} static void TearDownTestCase() {} - bool use_block_table_; - bool use_block_based_builder_; + BFP2::PseudoMode bfp_impl_; bool partition_filters_; Options options_; }; @@ -1006,7 +1179,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_EQ(value3, iter->value().ToString()); // The seek doesn't check block-based bloom filter because last index key // starts with the same prefix we're seeking to. - uint64_t expected_hits = use_block_based_builder_ ? 1 : 2; + uint64_t expected_hits = bfp_impl_ == BFP::kDeprecatedBlock ? 
1 : 2; ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); iter->Seek(key2); @@ -1016,12 +1189,14 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count); } -INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, - ::testing::Values(std::make_tuple(true, true, false), - std::make_tuple(true, false, false), - std::make_tuple(true, false, true), - std::make_tuple(false, false, - false))); +INSTANTIATE_TEST_CASE_P( + BloomStatsTestWithParam, BloomStatsTestWithParam, + ::testing::Values(std::make_tuple(BFP::kDeprecatedBlock, false), + std::make_tuple(BFP::kLegacyBloom, false), + std::make_tuple(BFP::kLegacyBloom, true), + std::make_tuple(BFP::kFastLocalBloom, false), + std::make_tuple(BFP::kFastLocalBloom, true), + std::make_tuple(BFP2::kPlainTable, false))); namespace { void PrefixScanInit(DBBloomFilterTest* dbtest) { @@ -1095,6 +1270,8 @@ TEST_F(DBBloomFilterTest, PrefixScan) { options.max_background_compactions = 2; options.create_if_missing = true; options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + assert(!options.unordered_write); + // It is incompatible with allow_concurrent_memtable_write=false options.allow_concurrent_memtable_write = false; BlockBasedTableOptions table_options; @@ -1145,6 +1322,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.optimize_filters_for_hits = true; options.statistics = rocksdb::CreateDBStatistics(); + get_perf_context()->Reset(); get_perf_context()->EnablePerLevelPerfContext(); CreateAndReopenWithCF({"mypikachu"}, options); @@ -1326,8 +1504,8 @@ int CountIter(std::unique_ptr& iter, const Slice& key) { // into the same string, or 2) the transformed seek key is of the same length // as the upper bound and two keys are adjacent according to the comparator. 
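// [Editor's note: illustrative sketch, not part of this change.] The test
// below exercises prefix-bloom seeks bounded by an upper bound, with the
// prefix_extractor swapped at runtime via SetOptions. Outside the test
// harness the same pattern looks roughly like this; the helper name and the
// literal keys are made up, while ReadOptions::iterate_upper_bound,
// DB::NewIterator and DB::SetOptions are standard public API:
void PrefixSeekWithUpperBoundSketch(DB* db) {
  Slice upper_bound("abcdxx00");
  ReadOptions ro;
  // With a short, prefix-compatible upper bound the seek can consult the SST
  // prefix bloom filters and skip files with no matching prefix.
  ro.iterate_upper_bound = &upper_bound;
  std::unique_ptr<Iterator> it(db->NewIterator(ro));
  for (it->Seek("abcdxx"); it->Valid(); it->Next()) {
    // ... consume keys in ["abcdxx", "abcdxx00") ...
  }
  // The extractor can be changed without reopening the DB; SSTs written
  // earlier keep the filters they were built with.
  db->SetOptions({{"prefix_extractor", "capped:4"}});
}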
TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { + int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); @@ -1336,8 +1514,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); table_options.index_shortening = BlockBasedTableOptions:: IndexShorteningMode::kShortenSeparatorsAndSuccessor; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1380,7 +1557,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); // should check bloom filter since upper bound meets requirement ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration); + 2 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1394,7 +1571,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); // should skip bloom filter since upper bound is too long ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration); + 2 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1408,7 +1585,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { // should check bloom filter since upper bound matches transformed seek // key ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } { @@ -1422,7 +1599,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); // should skip bloom filter since mismatch is found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); @@ -1436,7 +1613,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); @@ -1449,18 +1626,17 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 3 + iteration * 2); + 3 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } - iteration++; } } // Create multiple SST files each with a different prefix_extractor config, // verify iterators can read all SST files using the latest config. 
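// [Editor's note: illustrative sketch, not part of this change.] The
// TestGetTickerCount() assertions in these tests go through the DBTestBase
// harness; with a plain DB the same counters are read directly from the
// Statistics object installed on Options. The helper name is made up; the
// ticker enum and getTickerCount() are the public statistics API:
uint64_t PrefixBloomUsefulCountSketch(const Options& options) {
  // Assumes options.statistics was set, e.g. with CreateDBStatistics(),
  // before the DB was opened.
  return options.statistics->getTickerCount(BLOOM_FILTER_PREFIX_USEFUL);
}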
TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { + int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1468,8 +1644,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { options.statistics = CreateDBStatistics(); // Enable prefix bloom for SST files BlockBasedTableOptions table_options; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1495,10 +1670,10 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 1 + iteration); + 1 + using_full_builder); ASSERT_EQ(CountIter(iter, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 1 + iteration); + 1 + using_full_builder); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); // second SST with capped:3 BF @@ -1512,13 +1687,13 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter_tmp(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 2 + iteration * 2); + 2 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // both counters are incremented because BF is "not changed" for 1 of the // 2 SST files, so filter is checked once and found no match. 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 3 + iteration * 2); + 3 + using_full_builder * 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); } @@ -1537,24 +1712,24 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); // the first and last BF are checked ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 4 + iteration * 3); + 4 + using_full_builder * 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // only last BF is checked and not found ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 5 + iteration * 3); + 5 + using_full_builder * 3); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); } // iter_old can only see the first SST, so checked plus 1 ASSERT_EQ(CountIter(iter_old, "foo"), 4); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 6 + iteration * 3); + 6 + using_full_builder * 3); // iter was created after the first setoptions call so only full filter // will check the filter ASSERT_EQ(CountIter(iter, "foo"), 2); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 6 + iteration * 4); + 6 + using_full_builder * 4); { // keys in all three SSTs are visible to iterator @@ -1563,11 +1738,11 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 9); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 7 + iteration * 5); + 7 + using_full_builder * 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 8 + iteration * 5); + 8 + using_full_builder * 5); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); @@ -1579,15 +1754,14 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { // all three SST are checked because the current options has the same as // the remaining SST (capped:3) ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 9 + iteration * 7); + 9 + using_full_builder * 7); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), - 10 + iteration * 7); + 10 + using_full_builder * 7); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); } // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? 
- iteration++; } } @@ -1596,7 +1770,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { // as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { Options options = CurrentOptions(); options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1605,8 +1779,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); ReadOptions read_options; @@ -1655,8 +1828,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { // Verify it's possible to change prefix_extractor at runtime and iterators // behaves as expected TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { - int iteration = 0; - for (bool use_block_based_builder : {true, false}) { + for (auto bfp_impl : BFP::kAllFixedImpls) { Options options; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -1665,8 +1837,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { // Enable prefix bloom for SST files BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - table_options.filter_policy.reset( - NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.filter_policy.reset(new BFP(10, bfp_impl)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1717,7 +1888,6 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { ASSERT_EQ(CountIter(iter_old, "abc"), 0); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); - iteration++; } } diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 37e80048e6d..90fa1e88337 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -720,7 +720,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { cfilter_count = 0; ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // The filter should delete 40 records. 
- ASSERT_EQ(40U, cfilter_count); + ASSERT_EQ(40, cfilter_count); { // Scan the entire database as of the snapshot to ensure diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index b5033b66f0c..84f9f55dd7b 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -12,10 +12,11 @@ #include "port/stack_trace.h" #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" +#include "rocksdb/sst_file_writer.h" #include "rocksdb/utilities/convenience.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" namespace rocksdb { @@ -141,7 +142,7 @@ Options DeletionTriggerOptions(Options options) { options.compression = kNoCompression; options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24); options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; + options.max_write_buffer_size_to_maintain = 0; options.num_levels = kCDTNumLevels; options.level0_file_num_compaction_trigger = 1; options.target_file_size_base = options.write_buffer_size * 2; @@ -497,14 +498,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // Create new iterator for: // (1) 1 for verifying flush results - // (2) 3 for compaction input files - // (3) 1 for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 5); + // (2) 1 for verifying compaction results. + // (3) New TableReaders will not be created for compaction inputs + ASSERT_EQ(num_new_table_reader, 2); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5); ASSERT_EQ(num_new_table_reader, 0); num_table_cache_lookup = 0; @@ -519,13 +520,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // May preload table cache too. ASSERT_GE(num_table_cache_lookup, 1); old_num_table_cache_lookup2 = num_table_cache_lookup; - // One for compaction input, one for verifying compaction results. - ASSERT_EQ(num_new_table_reader, 2); + // One for verifying compaction results. + // No new iterator created for compaction. + ASSERT_EQ(num_new_table_reader, 1); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 2); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); ASSERT_EQ(num_new_table_reader, 0); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -731,7 +733,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { // Now all column families qualify compaction but only one should be // scheduled, because no column family hits speed up condition. - ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); // Create two more files for one column family, which triggers speed up // condition, three compactions will be scheduled. @@ -745,7 +747,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 2)); } - ASSERT_EQ(3, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); // Unblock all threads to unblock all compactions. 
for (size_t i = 0; i < kTotalTasks; i++) { @@ -776,7 +778,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { // Now all column families qualify compaction but only one should be // scheduled, because no column family hits speed up condition. - ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); for (size_t i = 0; i < kTotalTasks; i++) { sleeping_tasks[i].WakeUp(); @@ -3520,76 +3522,238 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBCompactionTest, LevelPeriodicCompaction) { - const int kNumKeysPerFile = 32; - const int kNumLevelFiles = 2; +TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { const int kValueSize = 100; - Options options = CurrentOptions(); - options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days - options.max_open_files = -1; // needed for ttl compaction - env_->time_elapse_only_sleep_ = false; - options.env = env_; + for (bool if_restart : {false, true}) { + for (bool if_open_all_files : {false, true}) { + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + if (if_open_all_files) { + options.max_open_files = -1; + } else { + options.max_open_files = 20; + } + // RocksDB sanitize max open files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = 2; + }); + // In the case where all files are opened and doing DB restart + // forcing the oldest ancester time in manifest file to be 0 to + // simulate the case of reading from an old version. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_fieled = static_cast(arg); + *encoded_fieled = ""; + PutVarint64(encoded_fieled, 0); + } + }); + + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); - env_->addon_time_.store(0); - DestroyAndReopen(options); + int ttl_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kTtl) { + ttl_compactions++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Add two L6 files with key ranges: [1 .. 100], [101 .. 200]. + Random rnd(301); + for (int i = 1; i <= 100; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + for (int i = 101; i <= 200; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + MoveFilesToLevel(6); + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); - int periodic_compactions = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { - Compaction* compaction = reinterpret_cast(arg); - auto compaction_reason = compaction->compaction_reason(); - if (compaction_reason == CompactionReason::kPeriodicCompaction) { - periodic_compactions++; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Add two L4 files with key ranges: [1 .. 50], [51 .. 150]. 
+ for (int i = 1; i <= 50; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + for (int i = 51; i <= 150; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + MoveFilesToLevel(4); + ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel()); - Random rnd(301); - for (int i = 0; i < kNumLevelFiles; ++i) { - for (int j = 0; j < kNumKeysPerFile; ++j) { - ASSERT_OK( - Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + // Add one L1 file with key range: [26, 75]. + for (int i = 26; i <= 75; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, kValueSize))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(1); + ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel()); + + // LSM tree: + // L1: [26 .. 75] + // L4: [1 .. 50][51 ..... 150] + // L6: [1 ........ 100][101 .... 200] + // + // On TTL expiry, TTL compaction should be initiated on L1 file, and the + // compactions should keep going on until the key range hits bottom level. + // In other words: the compaction on this data range "cascasdes" until + // reaching the bottom level. + // + // Order of events on TTL expiry: + // 1. L1 file falls to L3 via 2 trivial moves which are initiated by the + // ttl + // compaction. + // 2. A TTL compaction happens between L3 and L4 files. Output file in L4. + // 3. The new output file from L4 falls to L5 via 1 trival move initiated + // by the ttl compaction. + // 4. A TTL compaction happens between L5 and L6 files. Ouptut in L6. + + // Add 25 hours and do a write + env_->addon_time_.fetch_add(25 * 60 * 60); + + ASSERT_OK(Put(Key(1), "1")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_EQ(5, ttl_compactions); + + env_->addon_time_.fetch_add(25 * 60 * 60); + ASSERT_OK(Put(Key(2), "1")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); + ASSERT_GE(ttl_compactions, 6); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } - Flush(); } - dbfull()->TEST_WaitForCompact(); +} - ASSERT_EQ("2", FilesPerLevel()); - ASSERT_EQ(0, periodic_compactions); +TEST_F(DBCompactionTest, LevelPeriodicCompaction) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; - // Add 50 hours and do a write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); - // Assert that the files stay in the same level - ASSERT_EQ("3", FilesPerLevel()); - // The two old files go through the periodic compaction process - ASSERT_EQ(2, periodic_compactions); + for (bool if_restart : {false, true}) { + for (bool if_open_all_files : {false, true}) { + Options options = CurrentOptions(); + options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + if (if_open_all_files) { + options.max_open_files = -1; // needed for ttl compaction + } else { + options.max_open_files = 20; + } + // RocksDB sanitize max open files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = 0; + }); + // In the case where all files are opened and doing DB restart + // forcing the file creation time in manifest file to be 0 to + // simulate the case of reading from an old version. 
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) { + if (if_restart && if_open_all_files) { + std::string* encoded_fieled = static_cast(arg); + *encoded_fieled = ""; + PutVarint64(encoded_fieled, 0); + } + }); + + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); - MoveFilesToLevel(1); - ASSERT_EQ("0,3", FilesPerLevel()); + int periodic_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), + RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); - // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("b", "2")); - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("1,3", FilesPerLevel()); - // The three old files now go through the periodic compaction process. 2 + 3. - ASSERT_EQ(5, periodic_compactions); + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); - // Add another 50 hours and do another write - env_->addon_time_.fetch_add(50 * 60 * 60); - ASSERT_OK(Put("c", "3")); - Flush(); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ("2,3", FilesPerLevel()); - // The four old files now go through the periodic compaction process. 5 + 4. - ASSERT_EQ(9, periodic_compactions); + // Add 50 hours and do a write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + MoveFilesToLevel(1); + ASSERT_EQ("0,3", FilesPerLevel()); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("b", "2")); + if (if_restart) { + Reopen(options); + } else { + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,3", FilesPerLevel()); + // The three old files now go through the periodic compaction process. 2 + // + 3. + ASSERT_EQ(5, periodic_compactions); + + // Add another 50 hours and do another write + env_->addon_time_.fetch_add(50 * 60 * 60); + ASSERT_OK(Put("c", "3")); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,3", FilesPerLevel()); + // The four old files now go through the periodic compaction process. 5 + // + 4. 
+ ASSERT_EQ(9, periodic_compactions); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + } } TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { @@ -3602,7 +3766,6 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { const int kValueSize = 100; Options options = CurrentOptions(); - options.max_open_files = -1; // needed for ttl compaction env_->time_elapse_only_sleep_ = false; options.env = env_; @@ -3747,6 +3910,91 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { + class TestCompactionFilter : public CompactionFilter { + const char* Name() const override { return "TestCompactionFilter"; } + }; + class TestCompactionFilterFactory : public CompactionFilterFactory { + const char* Name() const override { return "TestCompactionFilterFactory"; } + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr(new TestCompactionFilter()); + } + }; + + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Random rnd(301); + + Options options = CurrentOptions(); + TestCompactionFilter test_compaction_filter; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + env_->addon_time_.store(0); + + enum CompactionFilterType { + kUseCompactionFilter, + kUseCompactionFilterFactory + }; + + for (CompactionFilterType comp_filter_type : + {kUseCompactionFilter, kUseCompactionFilterFactory}) { + // Assert that periodic compactions are not enabled. + ASSERT_EQ(port::kMaxUint64 - 1, options.periodic_compaction_seconds); + + if (comp_filter_type == kUseCompactionFilter) { + options.compaction_filter = &test_compaction_filter; + options.compaction_filter_factory.reset(); + } else if (comp_filter_type == kUseCompactionFilterFactory) { + options.compaction_filter = nullptr; + options.compaction_filter_factory.reset( + new TestCompactionFilterFactory()); + } + DestroyAndReopen(options); + + // periodic_compaction_seconds should be set to the sanitized value when + // a compaction filter or a compaction filter factory is used. 
+ ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + int periodic_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Add 31 days and do a write + env_->addon_time_.fetch_add(31 * 24 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual @@ -3890,11 +4138,17 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { } Flush(1); } - auto manual_compaction_thread = port::Thread([this]() { + auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) - .IsShutdownInProgress()); + Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); + if (i == 0) { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsColumnFamilyDropped()); + } else { + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsShutdownInProgress()); + } }); TEST_SYNC_POINT( @@ -4158,7 +4412,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" }; - const int cf_count = sizeof cf_names / sizeof cf_names[0]; + const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0]; std::unordered_map cf_to_limiter; @@ -4177,7 +4431,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { std::vector option_vector; option_vector.reserve(cf_count); - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { ColumnFamilyOptions cf_opt(options); if (cf == 0) { // "Default" CF does't use compaction limiter @@ -4195,7 +4449,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { option_vector.emplace_back(DBOptions(options), cf_opt); } - for (int cf = 1; cf < cf_count; cf++) { + for (unsigned int cf = 1; cf < cf_count; cf++) { CreateColumnFamilies({cf_names[cf]}, option_vector[cf]); } @@ -4247,7 +4501,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { int keyIndex = 0; for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) { - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { for (int i = 0; i < kNumKeysPerFile; i++) { ASSERT_OK(Put(cf, Key(keyIndex++), "")); } @@ -4255,13 +4509,13 @@ TEST_F(DBCompactionTest, CompactionLimiter) { ASSERT_OK(Put(cf, "", "")); } - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { 
dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); } } // Enough L0 files to trigger compaction - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { ASSERT_EQ(NumTableFilesAtLevel(0, cf), options.level0_file_num_compaction_trigger); } @@ -4288,7 +4542,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { sleeping_compact_tasks[i].WaitUntilDone(); } - for (int cf = 0; cf < cf_count; cf++) { + for (unsigned int cf = 0; cf < cf_count; cf++) { dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); } @@ -4333,12 +4587,6 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) { options.env = new MockEnv(Env::Default()); Reopen(options); bool readahead = false; - SyncPoint::GetInstance()->SetCallBack( - "TableCache::NewIterator:for_compaction", [&](void* arg) { - bool* use_direct_reads = static_cast(arg); - ASSERT_EQ(*use_direct_reads, - options.use_direct_reads); - }); SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { bool* use_direct_writes = static_cast(arg); @@ -4551,6 +4799,36 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, CompactionDuringShutdown) { + Options opts = CurrentOptions(); + opts.level0_file_num_compaction_trigger = 2; + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + InternalStats* internal_stats_ptr = cfd->internal_stats(); + ASSERT_NE(internal_stats_ptr, nullptr); + + Random rnd(301); + for (auto i = 0; i < 2; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), RandomString(&rnd, 1024))); + } + Flush(); + } + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", + [&](void* /*arg*/) { + dbfull()->shutting_down_.store(true); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->error_handler_.GetBGError()); +} + // FixFileIngestionCompactionDeadlock tests and verifies that compaction and // file ingestion do not cause deadlock in the event of write stall triggered // by number of L0 files reaching level0_stop_writes_trigger. 
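// [Editor's note: illustrative sketch, not part of this change.] Several of
// the compaction tests in this file choreograph background work with the
// sync-point facility. The general pattern, using only calls that appear
// verbatim in this diff ("Test:ReadyForCompaction" is a made-up point name;
// the other two are real points used by these tests):
void SyncPointPatternSketch() {
  // "CompactionJob::Run():Start" will block until the test code executes
  // TEST_SYNC_POINT("Test:ReadyForCompaction").
  rocksdb::SyncPoint::GetInstance()->LoadDependency(
      {{"Test:ReadyForCompaction", "CompactionJob::Run():Start"}});
  // Run a callback (count, inspect, or mutate the argument) every time
  // execution passes the named point.
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "FindIntraL0Compaction", [](void* /*arg*/) { /* e.g. bump a counter */ });
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
  // ... run the workload, hit TEST_SYNC_POINT("Test:ReadyForCompaction"),
  // wait for compaction, then tear down:
  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
}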
@@ -4628,6 +4906,201 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { Close(); } +TEST_F(DBCompactionTest, ConsistencyFailTest) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "VersionBuilder::CheckConsistency", [&](void* arg) { + auto p = + reinterpret_cast*>(arg); + // just swap the two FileMetaData so that we hit error + // in CheckConsistency funcion + FileMetaData* temp = *(p->first); + *(p->first) = *(p->second); + *(p->second) = temp; + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put("foo", "bar")); + Flush(); + } + + ASSERT_NOK(Put("foo", "bar")); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +void IngestOneKeyValue(DBImpl* db, const std::string& key, + const std::string& value, const Options& options) { + ExternalSstFileInfo info; + std::string f = test::PerThreadDBPath("sst_file" + key); + EnvOptions env; + rocksdb::SstFileWriter writer(env, options); + auto s = writer.Open(f); + ASSERT_OK(s); + // ASSERT_OK(writer.Put(Key(), "")); + ASSERT_OK(writer.Put(key, value)); + + ASSERT_OK(writer.Finish(&info)); + IngestExternalFileOptions ingest_opt; + + ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt)); +} + +TEST_P(DBCompactionTestWithParam, + FlushAfterIntraL0CompactionCheckConsistencyFail) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::atomic pick_intra_l0_count(0); + std::string value(RandomString(&rnd, kValueSize)); + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompactionTestWithParam::FlushAfterIntraL0:1", + "CompactionJob::Run():Start"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", + [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // prevents trivial move + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(i), "")); // prevents trivial move + } + ASSERT_OK(Flush()); + Compact("", Key(99)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + // Flush 5 L0 sst. + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i + 1), value)); + ASSERT_OK(Flush()); + } + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + + // Put one key, to make smallest log sequence number in this memtable is less + // than sst which would be ingested in next step. + ASSERT_OK(Put(Key(0), "a")); + + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + + // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction. + for (int i = 5; i < 10; i++) { + IngestOneKeyValue(dbfull(), Key(i), value, options); + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } + + TEST_SYNC_POINT("DBCompactionTestWithParam::FlushAfterIntraL0:1"); + // Put one key, to make biggest log sequence number in this memtable is bigger + // than sst which would be ingested in next step. 
+ ASSERT_OK(Put(Key(2), "b")); + ASSERT_EQ(10, NumTableFilesAtLevel(0)); + dbfull()->TEST_WaitForCompact(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GT(level_to_files[0].size(), 0); + ASSERT_GT(pick_intra_l0_count.load(), 0); + + ASSERT_OK(Flush()); +} + +TEST_P(DBCompactionTestWithParam, + IntraL0CompactionAfterFlushCheckConsistencyFail) { + Options options = CurrentOptions(); + options.force_consistency_checks = true; + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = 5; + options.max_background_compactions = 2; + options.max_subcompactions = max_subcompactions_; + options.write_buffer_size = 2 << 20; + options.max_write_buffer_number = 6; + DestroyAndReopen(options); + + const size_t kValueSize = 1 << 20; + Random rnd(301); + std::string value(RandomString(&rnd, kValueSize)); + std::string value2(RandomString(&rnd, kValueSize)); + std::string bigvalue = value + value; + + // prevents trivial move + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(i), "")); // prevents trivial move + } + ASSERT_OK(Flush()); + Compact("", Key(99)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + std::atomic pick_intra_l0_count(0); + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1", + "CompactionJob::Run():Start"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", + [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Make 6 L0 sst. + for (int i = 0; i < 6; ++i) { + if (i % 2 == 0) { + IngestOneKeyValue(dbfull(), Key(i), value, options); + } else { + ASSERT_OK(Put(Key(i), value)); + ASSERT_OK(Flush()); + } + } + + ASSERT_EQ(6, NumTableFilesAtLevel(0)); + + // Stop run flush job + env_->SetBackgroundThreads(1, Env::HIGH); + test::SleepingBackgroundTask sleeping_tasks; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks, + Env::Priority::HIGH); + sleeping_tasks.WaitUntilSleeping(); + + // Put many keys to make memtable request to flush + for (int i = 0; i < 6; ++i) { + ASSERT_OK(Put(Key(i), bigvalue)); + } + + ASSERT_EQ(6, NumTableFilesAtLevel(0)); + // ingest file to trigger IntraL0Compaction + for (int i = 6; i < 10; ++i) { + ASSERT_EQ(i, NumTableFilesAtLevel(0)); + IngestOneKeyValue(dbfull(), Key(i), value2, options); + } + ASSERT_EQ(10, NumTableFilesAtLevel(0)); + + // Wake up flush job + sleeping_tasks.WakeUp(); + sleeping_tasks.WaitUntilDone(); + TEST_SYNC_POINT("DBCompactionTestWithParam::IntraL0CompactionAfterFlush:1"); + dbfull()->TEST_WaitForCompact(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + uint64_t error_count = 0; + db_->GetIntProperty("rocksdb.background-errors", &error_count); + ASSERT_EQ(error_count, 0); + ASSERT_GT(pick_intra_l0_count.load(), 0); + for (int i = 0; i < 6; ++i) { + ASSERT_EQ(bigvalue, Get(Key(i))); + } + for (int i = 6; i < 10; ++i) { + ASSERT_EQ(value2, Get(Key(i))); + } +} + #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index 46ba411b6fd..bc72744656e 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -7,7 +7,7 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif #include #include @@ -83,6 
+83,34 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { } } +TEST_F(DBEncryptionTest, ReadEmptyFile) { + auto defaultEnv = Env::Default(); + + // create empty file for reading it back in later + auto envOptions = EnvOptions(CurrentOptions()); + auto filePath = dbname_ + "/empty.empty"; + + Status status; + { + std::unique_ptr writableFile; + status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions); + ASSERT_OK(status); + } + + std::unique_ptr seqFile; + status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + ASSERT_OK(status); + + std::string scratch; + Slice data; + // reading back 16 bytes from the empty file shouldn't trigger an assertion. + // it should just work and return an empty string + status = seqFile->Read(16, &data, (char*)scratch.data()); + ASSERT_OK(status); + + ASSERT_TRUE(data.empty()); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index ace0befb6d5..dd5f8f67f09 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -6,24 +6,20 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include +#include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/job_context.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/file_util.h" -#include "util/filename.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sync_point.h" namespace rocksdb { @@ -61,7 +57,9 @@ Status DBImpl::EnableFileDeletions(bool force) { } if (file_deletion_enabled) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); - PurgeObsoleteFiles(job_context); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", @@ -165,6 +163,15 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { return wal_manager_.GetSortedWalFiles(files); } +Status DBImpl::GetCurrentWalFile(std::unique_ptr* current_log_file) { + uint64_t current_logfile_number; + { + InstrumentedMutexLock l(&mutex_); + current_logfile_number = logfile_number_; + } + + return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file); +} } #endif // ROCKSDB_LITE diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 09c461f8da4..08a1d8d1be1 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -7,10 +7,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
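Editor's note: the db_filesnapshot.cc hunk above adds DBImpl::GetCurrentWalFile(), which reads logfile_number_ under the DB mutex and then asks the WalManager for the matching live WAL. A minimal usage sketch, assuming the matching declaration on the public DB interface and the LogFile accessors from rocksdb/transaction_log.h:

    // Sketch: report the WAL file the DB is currently appending to.
    #include <cstdio>
    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/transaction_log.h"

    void PrintCurrentWal(rocksdb::DB* db) {
      std::unique_ptr<rocksdb::LogFile> wal;
      rocksdb::Status s = db->GetCurrentWalFile(&wal);
      if (s.ok() && wal != nullptr) {
        std::printf("live WAL %s (log #%llu, %llu bytes)\n",
                    wal->PathName().c_str(),
                    static_cast<unsigned long long>(wal->LogNumber()),
                    static_cast<unsigned long long>(wal->SizeFileBytes()));
      }
    }

Only the log number is read while holding the mutex; the file lookup itself happens outside the lock, which keeps the critical section short.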
+#include + +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" namespace rocksdb { @@ -290,6 +296,129 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { Close(); } +TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) { + Options options = CurrentOptions(); + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:AfterScheduleFlush", + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_.resize(1); + TEST_SYNC_POINT( + "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { + class TestListener : public EventListener { + public: + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + // There's only one key in each flush. + ASSERT_EQ(info.smallest_seqno, info.largest_seqno); + ASSERT_NE(0, info.smallest_seqno); + if (info.smallest_seqno == seq1) { + // First flush completed + ASSERT_FALSE(completed1); + completed1 = true; + CheckFlushResultCommitted(db, seq1); + } else { + // Second flush completed + ASSERT_FALSE(completed2); + completed2 = true; + ASSERT_EQ(info.smallest_seqno, seq2); + CheckFlushResultCommitted(db, seq2); + } + } + + void CheckFlushResultCommitted(DB* db, SequenceNumber seq) { + DBImpl* db_impl = static_cast_with_check(db); + InstrumentedMutex* mutex = db_impl->mutex(); + mutex->Lock(); + auto* cfd = + reinterpret_cast(db->DefaultColumnFamily()) + ->cfd(); + ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber()); + mutex->Unlock(); + } + + std::atomic seq1{0}; + std::atomic seq2{0}; + std::atomic completed1{false}; + std::atomic completed2{false}; + }; + std::shared_ptr listener = std::make_shared(); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:start", + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"}, + {"DBImpl::FlushMemTableToOutputFile:Finish", + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}}); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table", [&listener](void* arg) { + // Wait for the second flush finished, out of mutex. 
+ auto* mems = reinterpret_cast*>(arg); + if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) { + TEST_SYNC_POINT( + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:" + "WaitSecond"); + } + }); + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.listeners.push_back(listener); + // Setting max_flush_jobs = max_background_jobs / 4 = 2. + options.max_background_jobs = 8; + // Allow 2 immutable memtables. + options.max_write_buffer_number = 3; + Reopen(options); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put("foo", "v")); + listener->seq1 = db_->GetLatestSequenceNumber(); + // t1 will wait for the second flush complete before committing flush result. + auto t1 = port::Thread([&]() { + // flush_opts.wait = true + ASSERT_OK(db_->Flush(FlushOptions())); + }); + // Wait for first flush started. + TEST_SYNC_POINT( + "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"); + // The second flush will exit early without commit its result. The work + // is delegated to the first flush. + ASSERT_OK(Put("bar", "v")); + listener->seq2 = db_->GetLatestSequenceNumber(); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db_->Flush(flush_opts)); + t1.join(); + ASSERT_TRUE(listener->completed1); + ASSERT_TRUE(listener->completed2); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -431,7 +560,7 @@ TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) { cf_ids.push_back(cf_id); } ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - ASSERT_TRUE(Flush(cf_ids).IsShutdownInProgress()); + ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped()); Destroy(options); } @@ -514,6 +643,80 @@ TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) { ASSERT_EQ("value", Get(0, "key")); } +TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) { + bool atomic_flush = GetParam(); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.max_write_buffer_number = 4; + // Set min_write_buffer_number_to_merge to be greater than 1, so that + // a column family with one memtable in the imm will not cause IsFlushPending + // to return true when flush_requested_ is false. + options.min_write_buffer_number_to_merge = 2; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(dbfull()->PauseBackgroundWork()); + ASSERT_OK(Put(0, "key00", "value00")); + ASSERT_OK(Put(1, "key10", "value10")); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + ASSERT_OK(Put(0, "key01", "value01")); + // Since max_write_buffer_number is 4, the following flush won't cause write + // stall. 
+ ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); + handles_[1] = nullptr; + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + delete handles_[0]; + handles_.clear(); +} + +TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"}, + {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree", + "DBImpl::BackgroundCallFlush:start"}, + {"DBImpl::BackgroundCallFlush:start", + "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(2, handles_.size()); + ASSERT_OK(Put(0, "key", "value")); + ASSERT_OK(Put(1, "key", "value")); + auto* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + auto* cfd_pikachu = static_cast(handles_[1])->cfd(); + port::Thread drop_cf_thr([&]() { + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + delete handles_[1]; + handles_.resize(1); + TEST_SYNC_POINT( + "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree"); + }); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu}, + flush_opts)); + drop_cf_thr.join(); + Close(); + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff --git a/db/db_impl.cc b/db/db_impl/db_impl.cc similarity index 78% rename from db/db_impl.cc rename to db/db_impl/db_impl.cc index c6268d0cb80..ee73cc3fd75 100644 --- a/db/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -6,17 +6,15 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
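Editor's note: the flush and column-family race tests above all lean on the same sync-point machinery, so a condensed sketch of the ordering contract may help. LoadDependency takes {predecessor, successor} pairs, and a thread hitting the successor point blocks until some thread has passed the predecessor. The point names below are invented for illustration; the facility lives in test_util/sync_point.h and compiles to no-ops in NDEBUG builds.

    // Sketch: force "second" to run only after "first", across threads.
    #include "port/port.h"
    #include "test_util/sync_point.h"

    void OrderingSketch() {
      rocksdb::SyncPoint::GetInstance()->LoadDependency(
          {{"Sketch:First", "Sketch:Second"}});
      rocksdb::SyncPoint::GetInstance()->EnableProcessing();

      rocksdb::port::Thread waiter([&]() {
        // Blocks here until the main thread reaches "Sketch:First".
        TEST_SYNC_POINT("Sketch:Second");
        // ... the work that must happen second ...
      });

      // ... the work that must happen first ...
      TEST_SYNC_POINT("Sketch:First");

      waiter.join();
      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
    }

That is exactly the shape of CFDropRaceWithWaitForFlushMemTables above: the drop thread is held back until the flush has been scheduled, and the background flush is held back until the handle has been freed, which is what makes the race deterministic.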
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include #ifdef OS_SOLARIS #include #endif #include +#include #include #include #include @@ -27,8 +25,9 @@ #include #include +#include "db/arena_wrapped_db_iter.h" #include "db/builder.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -37,7 +36,7 @@ #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/forward_iterator.h" -#include "db/in_memory_stats_history.h" +#include "db/import_column_family_job.h" #include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -53,10 +52,19 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "logging/auto_roll_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" +#include "monitoring/in_memory_stats_history.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" #include "options/cf_options.h" @@ -74,33 +82,29 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/merging_iterator.h" #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "tools/sst_dump_tool_imp.h" -#include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); +const std::string kPersistentStatsColumnFamilyName( + "___rocksdb_stats_history___"); void DumpRocksDBBuildVersion(Logger* log); CompressionType GetCompressionFlush( @@ -161,10 +165,12 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, batch_per_txn_(batch_per_txn), db_lock_(nullptr), shutting_down_(false), + manual_compaction_paused_(false), bg_cv_(&mutex_), logfile_number_(0), log_dir_synced_(false), log_empty_(true), + persist_stats_cf_handle_(nullptr), log_sync_cv_(&mutex_), total_log_size_(0), is_snapshot_supported_(true), @@ -235,12 +241,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, const int table_cache_size = (mutable_db_options_.max_open_files == -1) ? 
TableCache::kInfiniteCapacity : mutable_db_options_.max_open_files - 10; - table_cache_ = NewLRUCache(table_cache_size, - immutable_db_options_.table_cache_numshardbits); + LRUCacheOptions co; + co.capacity = table_cache_size; + co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_cache_ = NewLRUCache(co); versions_.reset(new VersionSet(dbname_, &immutable_db_options_, env_options_, table_cache_.get(), write_buffer_manager_, - &write_controller_)); + &write_controller_, &block_cache_tracer_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -468,6 +477,7 @@ Status DBImpl::CloseHelper() { &files_grabbed_for_purge_); EraseThreadStatusDbInfo(); flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); while (!flush_queue_.empty()) { const FlushRequest& flush_req = PopFirstFromFlushQueue(); @@ -485,10 +495,17 @@ Status DBImpl::CloseHelper() { } } - if (default_cf_handle_ != nullptr) { + if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking mutex_.Unlock(); - delete default_cf_handle_; + if (default_cf_handle_) { + delete default_cf_handle_; + default_cf_handle_ = nullptr; + } + if (persist_stats_cf_handle_) { + delete persist_stats_cf_handle_; + persist_stats_cf_handle_ = nullptr; + } mutex_.Lock(); } @@ -582,6 +599,12 @@ Status DBImpl::CloseHelper() { ret = s; } } + if (ret.IsAborted()) { + // Reserve IsAborted() error for those where users didn't release + // certain resource and they can release them and come back and + // retry. In this case, we wrap this exception to something else. + return Status::Incomplete(ret.ToString()); + } return ret; } @@ -631,7 +654,7 @@ void DBImpl::StartTimedTasks() { if (!thread_dump_stats_) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - stats_dump_period_sec * 1000000)); + static_cast(stats_dump_period_sec) * kMicrosInSecond)); } } stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; @@ -639,14 +662,14 @@ void DBImpl::StartTimedTasks() { if (!thread_persist_stats_) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - stats_persist_period_sec * 1000000)); + static_cast(stats_persist_period_sec) * kMicrosInSecond)); } } } } // esitmate the total size of stats_history_ -size_t DBImpl::EstiamteStatsHistorySize() const { +size_t DBImpl::EstimateInMemoryStatsHistorySize() const { size_t size_total = sizeof(std::map>); if (stats_history_.size() == 0) return size_total; @@ -668,7 +691,7 @@ void DBImpl::PersistStats() { if (shutdown_initiated_) { return; } - uint64_t now_micros = env_->NowMicros(); + uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; Statistics* statistics = immutable_db_options_.statistics.get(); if (!statistics) { return; @@ -679,12 +702,50 @@ void DBImpl::PersistStats() { stats_history_size_limit = mutable_db_options_.stats_history_buffer_size; } - // TODO(Zhongyi): also persist immutable_db_options_.statistics - { - std::map stats_map; - if (!statistics->getTickerMap(&stats_map)) { - return; + std::map stats_map; + if (!statistics->getTickerMap(&stats_map)) { + return; + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- PERSISTING STATS -------"); + + if (immutable_db_options_.persist_stats_to_disk) { + WriteBatch batch; + if (stats_slice_initialized_) 
{ + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Reading %" ROCKSDB_PRIszt " stats from statistics\n", + stats_slice_.size()); + for (const auto& stat : stats_map) { + char key[100]; + int length = + EncodePersistentStatsKey(now_seconds, stat.first, 100, key); + // calculate the delta from last time + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + uint64_t delta = stat.second - stats_slice_[stat.first]; + batch.Put(persist_stats_cf_handle_, Slice(key, std::min(100, length)), + ToString(delta)); + } + } } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + Status s = Write(wo, &batch); + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing to persistent stats CF failed -- %s", + s.ToString().c_str()); + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to persistent stats CF succeeded", + stats_slice_.size(), now_seconds); + } + // TODO(Zhongyi): add purging for persisted data + } else { InstrumentedMutexLock l(&stats_history_mutex_); // calculate the delta from last time if (stats_slice_initialized_) { @@ -694,20 +755,33 @@ void DBImpl::PersistStats() { stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; } } - stats_history_[now_micros] = stats_delta; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 + " to in-memory stats history", + stats_slice_.size(), now_seconds); + stats_history_[now_seconds] = stats_delta; } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); // delete older stats snapshots to control memory consumption - bool purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + size_t stats_history_size = EstimateInMemoryStatsHistorySize(); + bool purge_needed = stats_history_size > stats_history_size_limit; + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); while (purge_needed && !stats_history_.empty()) { stats_history_.erase(stats_history_.begin()); - purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + purge_needed = + EstimateInMemoryStatsHistorySize() > stats_history_size_limit; } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt + " bytes, slice count: %" ROCKSDB_PRIszt, + stats_history_size, stats_history_.size()); } - // TODO: persist stats to disk #endif // !ROCKSDB_LITE } @@ -738,8 +812,13 @@ Status DBImpl::GetStatsHistory( if (!stats_iterator) { return Status::InvalidArgument("stats_iterator not preallocated."); } - stats_iterator->reset( - new InMemoryStatsHistoryIterator(start_time, end_time, this)); + if (immutable_db_options_.persist_stats_to_disk) { + stats_iterator->reset( + new PersistentStatsHistoryIterator(start_time, end_time, this)); + } else { + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + } return (*stats_iterator)->status(); } @@ -792,6 +871,24 @@ void DBImpl::DumpStats() { PrintStatistics(); } +Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str) { + auto* cfh = + static_cast_with_check( + column_family); + ColumnFamilyData* cfd = 
cfh->cfd(); + + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + Version* version = super_version->current; + + Status s = + version->TablesRangeTombstoneSummary(max_entries_to_print, out_str); + + CleanupSuperVersion(super_version); + return s; +} + void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { if (!job_context->logs_to_free.empty()) { for (auto l : job_context->logs_to_free) { @@ -811,16 +908,6 @@ Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { return ret_dir; } -Directory* DBImpl::Directories::GetDataDir(size_t path_id) const { - assert(path_id < data_dirs_.size()); - Directory* ret_dir = data_dirs_[path_id].get(); - if (ret_dir == nullptr) { - // Should use db_dir_ - return db_dir_.get(); - } - return ret_dir; -} - Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { @@ -842,8 +929,9 @@ Status DBImpl::SetOptions( Status persist_options_status; SuperVersionContext sv_context(/* create_superversion */ true); { + auto db_options = GetDBOptions(); InstrumentedMutexLock l(&mutex_); - s = cfd->SetOptions(options_map); + s = cfd->SetOptions(db_options, options_map); if (s.ok()) { new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. @@ -906,6 +994,25 @@ Status DBImpl::SetDBOptions( InstrumentedMutexLock l(&mutex_); s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map, &new_options); + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } + DBOptions new_db_options = + BuildDBOptions(immutable_db_options_, new_options); + if (s.ok()) { + s = ValidateOptions(new_db_options); + } + if (s.ok()) { + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->IsDropped()) { + auto cf_options = c->GetLatestCFOptions(); + s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options); + if (!s.ok()) { + break; + } + } + } + } if (s.ok()) { if (new_options.max_background_compactions > mutable_db_options_.max_background_compactions) { @@ -923,7 +1030,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_dump_period_sec > 0) { thread_dump_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::DumpStats(); }, "dump_st", env_, - new_options.stats_dump_period_sec * 1000000)); + static_cast(new_options.stats_dump_period_sec) * + kMicrosInSecond)); } else { thread_dump_stats_.reset(); } @@ -938,7 +1046,8 @@ Status DBImpl::SetDBOptions( if (new_options.stats_persist_period_sec > 0) { thread_persist_stats_.reset(new rocksdb::RepeatableThread( [this]() { DBImpl::PersistStats(); }, "pst_st", env_, - new_options.stats_persist_period_sec * 1000000)); + static_cast(new_options.stats_persist_period_sec) * + kMicrosInSecond)); } else { thread_persist_stats_.reset(); } @@ -950,15 +1059,12 @@ Status DBImpl::SetDBOptions( : new_options.max_open_files - 10); wal_changed = mutable_db_options_.wal_bytes_per_sync != new_options.wal_bytes_per_sync; - if (new_options.bytes_per_sync == 0) { - new_options.bytes_per_sync = 1024 * 1024; - } mutable_db_options_ = new_options; - env_options_for_compaction_ = EnvOptions( - BuildDBOptions(immutable_db_options_, mutable_db_options_)); + env_options_for_compaction_ = EnvOptions(new_db_options); env_options_for_compaction_ = env_->OptimizeForCompactionTableWrite( env_options_for_compaction_, immutable_db_options_); versions_->ChangeEnvOptions(mutable_db_options_); + //TODO(xiez): clarify why apply optimize for read to write options env_options_for_compaction_ 
= env_->OptimizeForCompactionTableRead( env_options_for_compaction_, immutable_db_options_); env_options_for_compaction_.compaction_readahead_size = @@ -1026,10 +1132,13 @@ int DBImpl::FindMinimumEmptyLevelFitting( Status DBImpl::FlushWAL(bool sync) { if (manual_wal_flush_) { - // We need to lock log_write_mutex_ since logs_ might change concurrently - InstrumentedMutexLock wl(&log_write_mutex_); - log::Writer* cur_log_writer = logs_.back().writer; - auto s = cur_log_writer->WriteBuffer(); + Status s; + { + // We need to lock log_write_mutex_ since logs_ might change concurrently + InstrumentedMutexLock wl(&log_write_mutex_); + log::Writer* cur_log_writer = logs_.back().writer; + s = cur_log_writer->WriteBuffer(); + } if (!s.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", s.ToString().c_str()); @@ -1205,32 +1314,28 @@ void DBImpl::SchedulePurge() { void DBImpl::BackgroundCallPurge() { mutex_.Lock(); - // We use one single loop to clear both queues so that after existing the loop - // both queues are empty. This is stricter than what is needed, but can make - // it easier for us to reason the correctness. - while (!purge_queue_.empty() || !logs_to_free_queue_.empty()) { - // Check logs_to_free_queue_ first and close log writers. - if (!logs_to_free_queue_.empty()) { - assert(!logs_to_free_queue_.empty()); - log::Writer* log_writer = *(logs_to_free_queue_.begin()); - logs_to_free_queue_.pop_front(); - mutex_.Unlock(); - delete log_writer; - mutex_.Lock(); - } else { - auto purge_file = purge_queue_.begin(); - auto fname = purge_file->fname; - auto dir_to_sync = purge_file->dir_to_sync; - auto type = purge_file->type; - auto number = purge_file->number; - auto job_id = purge_file->job_id; - purge_queue_.pop_front(); + while (!logs_to_free_queue_.empty()) { + assert(!logs_to_free_queue_.empty()); + log::Writer* log_writer = *(logs_to_free_queue_.begin()); + logs_to_free_queue_.pop_front(); + mutex_.Unlock(); + delete log_writer; + mutex_.Lock(); + } + for (const auto& file : purge_files_) { + const PurgeFileInfo& purge_file = file.second; + const std::string& fname = purge_file.fname; + const std::string& dir_to_sync = purge_file.dir_to_sync; + FileType type = purge_file.type; + uint64_t number = purge_file.number; + int job_id = purge_file.job_id; - mutex_.Unlock(); - DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number); - mutex_.Lock(); - } + mutex_.Unlock(); + DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number); + mutex_.Lock(); } + purge_files_.clear(); + bg_purge_scheduled_--; bg_cv_.SignalAll(); @@ -1350,22 +1455,29 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { return default_cf_handle_; } +ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const { + return persist_stats_cf_handle_; +} + Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = value; + return GetImpl(read_options, key, get_impl_options); } -Status DBImpl::GetImpl(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* pinnable_val, bool* value_found, - ReadCallback* callback, bool* is_blob_index) { - assert(pinnable_val != nullptr); +Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, + GetImplOptions get_impl_options) { + 
assert(get_impl_options.value != nullptr || + get_impl_options.merge_operands != nullptr); PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); - auto cfh = reinterpret_cast(column_family); + auto cfh = + reinterpret_cast(get_impl_options.column_family); auto cfd = cfh->cfd(); if (tracer_) { @@ -1373,7 +1485,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // tracing is enabled. InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(column_family, key); + tracer_->Get(get_impl_options.column_family, key); } } @@ -1385,9 +1497,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, SequenceNumber snapshot; if (read_options.snapshot != nullptr) { - if (callback) { + if (get_impl_options.callback) { // Already calculated based on read_options.snapshot - snapshot = callback->max_visible_seq(); + snapshot = get_impl_options.callback->max_visible_seq(); } else { snapshot = reinterpret_cast(read_options.snapshot)->number_; @@ -1401,8 +1513,23 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, snapshot = last_seq_same_as_publish_seq_ ? versions_->LastSequence() : versions_->LastPublishedSequence(); - if (callback) { - callback->Refresh(snapshot); + if (get_impl_options.callback) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. + get_impl_options.callback->Refresh(snapshot); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. + // + // assert(callback->max_visible_seq() >= snapshot); + snapshot = get_impl_options.callback->max_visible_seq(); } } TEST_SYNC_POINT("DBImpl::GetImpl:3"); @@ -1416,26 +1543,46 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case. 
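Editor's note: GetImpl now receives a GetImplOptions bundle rather than a growing list of optional arguments, and besides the ordinary value lookup, the branches added just below can return the raw, unmerged operands for a key. A hedged sketch of the public call this plumbing serves, as wired up by this patch (the key and the operand-count hint are illustrative):

    // Sketch: read back the merge operands for a key without merging them.
    #include <vector>
    #include "rocksdb/db.h"

    rocksdb::Status DumpOperands(rocksdb::DB* db, const rocksdb::Slice& key) {
      rocksdb::GetMergeOperandsOptions opts;
      opts.expected_max_number_of_operands = 16;  // capacity hint, illustrative
      std::vector<rocksdb::PinnableSlice> operands(
          opts.expected_max_number_of_operands);
      int found = 0;
      rocksdb::Status s = db->GetMergeOperands(
          rocksdb::ReadOptions(), db->DefaultColumnFamily(), key,
          operands.data(), &opts, &found);
      // On success operands[0..found) hold the raw operands; if the key has
      // more operands than the hint allows, Status::Incomplete comes back,
      // matching the KMergeOperandsInsufficientCapacity subcode set below.
      return s;
    }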
- LookupKey lkey(key, snapshot); + LookupKey lkey(key, snapshot, read_options.timestamp); PERF_TIMER_STOP(get_snapshot_time); bool skip_memtable = (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; if (!skip_memtable) { - if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } else if ((s.ok() || s.IsMergeInProgress()) && - sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &max_covering_tombstone_seq, read_options, callback, - is_blob_index)) { - done = true; - pinnable_val->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); + // Get value associated with key + if (get_impl_options.get_value) { + if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->Get(lkey, get_impl_options.value->GetSelf(), &s, + &merge_context, &max_covering_tombstone_seq, + read_options, get_impl_options.callback, + get_impl_options.is_blob_index)) { + done = true; + get_impl_options.value->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + } else { + // Get Merge Operands associated with key, Merge Operands should not be + // merged and raw values should be returned to the user. + if (sv->mem->Get(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, read_options, nullptr, + nullptr, false)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + sv->imm->GetMergeOperands(lkey, &s, &merge_context, + &max_covering_tombstone_seq, + read_options)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } } if (!done && !s.ok() && !s.IsMergeInProgress()) { ReturnAndCleanupSuperVersion(cfd, sv); @@ -1444,9 +1591,14 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); - sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, - &max_covering_tombstone_seq, value_found, nullptr, nullptr, - callback, is_blob_index); + sv->current->Get( + read_options, lkey, get_impl_options.value, &s, &merge_context, + &max_covering_tombstone_seq, + get_impl_options.get_value ? get_impl_options.value_found : nullptr, + nullptr, nullptr, + get_impl_options.get_value ? get_impl_options.callback : nullptr, + get_impl_options.get_value ? 
get_impl_options.is_blob_index : nullptr, + get_impl_options.get_value); RecordTick(stats_, MEMTABLE_MISS); } @@ -1458,7 +1610,25 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, RecordTick(stats_, NUMBER_KEYS_READ); size_t size = 0; if (s.ok()) { - size = pinnable_val->size(); + if (get_impl_options.get_value) { + size = get_impl_options.value->size(); + } else { + // Return all merge operands for get_impl_options.key + *get_impl_options.number_of_operands = + static_cast(merge_context.GetNumOperands()); + if (*get_impl_options.number_of_operands > + get_impl_options.get_merge_operands_options + ->expected_max_number_of_operands) { + s = Status::Incomplete( + Status::SubCode::KMergeOperandsInsufficientCapacity); + } else { + for (const Slice& sl : merge_context.GetOperands()) { + size += sl.size(); + get_impl_options.merge_operands->PinSelf(sl); + get_impl_options.merge_operands++; + } + } + } RecordTick(stats_, BYTES_READ, size); PERF_COUNTER_ADD(get_read_bytes, size); } @@ -1475,14 +1645,9 @@ std::vector DBImpl::MultiGet( StopWatch sw(env_, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); - SequenceNumber snapshot; + SequenceNumber consistent_seqnum; + ; - struct MultiGetColumnFamilyData { - ColumnFamilyData* cfd; - SuperVersion* super_version; - MultiGetColumnFamilyData(ColumnFamilyData* cf, SuperVersion* sv) - : cfd(cf), super_version(sv) {} - }; std::unordered_map multiget_cf_data( column_family.size()); for (auto cf : column_family) { @@ -1490,86 +1655,20 @@ std::vector DBImpl::MultiGet( auto cfd = cfh->cfd(); if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { multiget_cf_data.emplace(cfd->GetID(), - MultiGetColumnFamilyData(cfd, nullptr)); + MultiGetColumnFamilyData(cfh, nullptr)); } } - bool last_try = false; - { - // If we end up with the same issue of memtable geting sealed during 2 - // consecutive retries, it means the write rate is very high. In that case - // its probably ok to take the mutex on the 3rd try so we can succeed for - // sure - static const int num_retries = 3; - for (auto i = 0; i < num_retries; ++i) { - last_try = (i == num_retries - 1); - bool retry = false; + std::function::iterator&)> + iter_deref_lambda = + [](std::unordered_map::iterator& + cf_iter) { return &cf_iter->second; }; - if (i > 0) { - for (auto mgd_iter = multiget_cf_data.begin(); - mgd_iter != multiget_cf_data.end(); ++mgd_iter) { - auto super_version = mgd_iter->second.super_version; - auto cfd = mgd_iter->second.cfd; - if (super_version != nullptr) { - ReturnAndCleanupSuperVersion(cfd, super_version); - } - mgd_iter->second.super_version = nullptr; - } - } - - if (read_options.snapshot == nullptr) { - if (last_try) { - TEST_SYNC_POINT("DBImpl::MultiGet::LastTry"); - // We're close to max number of retries. For the last retry, - // acquire the lock so we're sure to succeed - mutex_.Lock(); - } - snapshot = last_seq_same_as_publish_seq_ - ? 
versions_->LastSequence() - : versions_->LastPublishedSequence(); - } else { - snapshot = reinterpret_cast(read_options.snapshot) - ->number_; - } - - for (auto mgd_iter = multiget_cf_data.begin(); - mgd_iter != multiget_cf_data.end(); ++mgd_iter) { - if (!last_try) { - mgd_iter->second.super_version = - GetAndRefSuperVersion(mgd_iter->second.cfd); - } else { - mgd_iter->second.super_version = - mgd_iter->second.cfd->GetSuperVersion()->Ref(); - } - TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV"); - if (read_options.snapshot != nullptr || last_try) { - // If user passed a snapshot, then we don't care if a memtable is - // sealed or compaction happens because the snapshot would ensure - // that older key versions are kept around. If this is the last - // retry, then we have the lock so nothing bad can happen - continue; - } - // We could get the earliest sequence number for the whole list of - // memtables, which will include immutable memtables as well, but that - // might be tricky to maintain in case we decide, in future, to do - // memtable compaction. - if (!last_try) { - auto seq = - mgd_iter->second.super_version->mem->GetEarliestSequenceNumber(); - if (seq > snapshot) { - retry = true; - break; - } - } - } - if (!retry) { - if (last_try) { - mutex_.Unlock(); - } - break; - } - } - } + bool unref_only = + MultiCFSnapshot>( + read_options, nullptr, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); // Contain a list of merge operations if merge occurs. MergeContext merge_context; @@ -1593,7 +1692,7 @@ std::vector DBImpl::MultiGet( Status& s = stat_list[i]; std::string* value = &(*values)[i]; - LookupKey lkey(keys[i], snapshot); + LookupKey lkey(keys[i], consistent_seqnum); auto cfh = reinterpret_cast(column_family[i]); SequenceNumber max_covering_tombstone_seq = 0; auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); @@ -1637,7 +1736,7 @@ std::vector DBImpl::MultiGet( for (auto mgd_iter : multiget_cf_data) { auto mgd = mgd_iter.second; - if (!last_try) { + if (!unref_only) { ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version); } else { mgd.cfd->GetSuperVersion()->Unref(); @@ -1654,107 +1753,333 @@ std::vector DBImpl::MultiGet( return stat_list; } +template +bool DBImpl::MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot) { + PERF_TIMER_GUARD(get_snapshot_time); + + bool last_try = false; + if (cf_list->size() == 1) { + // Fast path for a single column family. We can simply get the thread loca + // super version + auto cf_iter = cf_list->begin(); + auto node = iter_deref_func(cf_iter); + node->super_version = GetAndRefSuperVersion(node->cfd); + if (read_options.snapshot != nullptr) { + // Note: In WritePrepared txns this is not necessary but not harmful + // either. Because prep_seq > snapshot => commit_seq > snapshot so if + // a snapshot is specified we should be fine with skipping seq numbers + // that are greater than that. + // + // In WriteUnprepared, we cannot set snapshot in the lookup key because we + // may skip uncommitted data that should be visible to the transaction for + // reading own writes. 
+ *snapshot = + static_cast(read_options.snapshot)->number_; + if (callback) { + *snapshot = std::max(*snapshot, callback->max_visible_seq()); + } + } else { + // Since we get and reference the super version before getting + // the snapshot number, without a mutex protection, it is possible + // that a memtable switch happened in the middle and not all the + // data for this snapshot is available. But it will contain all + // the data available in the super version we have, which is also + // a valid snapshot to read from. + // We shouldn't get snapshot before finding and referencing the super + // version because a flush happening in between may compact away data for + // the snapshot, but the snapshot is earlier than the data overwriting it, + // so users may see wrong results. + *snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } + } else { + // If we end up with the same issue of memtable geting sealed during 2 + // consecutive retries, it means the write rate is very high. In that case + // its probably ok to take the mutex on the 3rd try so we can succeed for + // sure + static const int num_retries = 3; + for (int i = 0; i < num_retries; ++i) { + last_try = (i == num_retries - 1); + bool retry = false; + + if (i > 0) { + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + SuperVersion* super_version = node->super_version; + ColumnFamilyData* cfd = node->cfd; + if (super_version != nullptr) { + ReturnAndCleanupSuperVersion(cfd, super_version); + } + node->super_version = nullptr; + } + } + if (read_options.snapshot == nullptr) { + if (last_try) { + TEST_SYNC_POINT("DBImpl::MultiGet::LastTry"); + // We're close to max number of retries. For the last retry, + // acquire the lock so we're sure to succeed + mutex_.Lock(); + } + *snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } else { + *snapshot = reinterpret_cast(read_options.snapshot) + ->number_; + } + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + if (!last_try) { + node->super_version = GetAndRefSuperVersion(node->cfd); + } else { + node->super_version = node->cfd->GetSuperVersion()->Ref(); + } + TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV"); + if (read_options.snapshot != nullptr || last_try) { + // If user passed a snapshot, then we don't care if a memtable is + // sealed or compaction happens because the snapshot would ensure + // that older key versions are kept around. If this is the last + // retry, then we have the lock so nothing bad can happen + continue; + } + // We could get the earliest sequence number for the whole list of + // memtables, which will include immutable memtables as well, but that + // might be tricky to maintain in case we decide, in future, to do + // memtable compaction. 
+ if (!last_try) { + SequenceNumber seq = + node->super_version->mem->GetEarliestSequenceNumber(); + if (seq > *snapshot) { + retry = true; + break; + } + } + } + if (!retry) { + if (last_try) { + mutex_.Unlock(); + } + break; + } + } + } + + // Keep track of bytes that we read for statistics-recording later + PERF_TIMER_STOP(get_snapshot_time); + + return last_try; +} + +void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input) { + if (num_keys == 0) { + return; + } + autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); + for (size_t i = 0; i < num_keys; ++i) { + key_context.emplace_back(column_families[i], keys[i], &values[i], + &statuses[i]); + } + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + + autovector + multiget_cf_data; + size_t cf_start = 0; + ColumnFamilyHandle* cf = sorted_keys[0]->column_family; + for (size_t i = 0; i < num_keys; ++i) { + KeyContext* key_ctx = sorted_keys[i]; + if (key_ctx->column_family != cf) { + multiget_cf_data.emplace_back( + MultiGetColumnFamilyData(cf, cf_start, i - cf_start, nullptr)); + cf_start = i; + cf = key_ctx->column_family; + } + } + { + // multiget_cf_data.emplace_back( + // MultiGetColumnFamilyData(cf, cf_start, num_keys - cf_start, nullptr)); + multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr); + } + std::function::iterator&)> + iter_deref_lambda = + [](autovector::iterator& cf_iter) { + return &(*cf_iter); + }; + + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot< + autovector>( + read_options, nullptr, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); + + for (auto cf_iter = multiget_cf_data.begin(); + cf_iter != multiget_cf_data.end(); ++cf_iter) { + MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys, + cf_iter->super_version, consistent_seqnum, nullptr, nullptr); + if (!unref_only) { + ReturnAndCleanupSuperVersion(cf_iter->cfd, cf_iter->super_version); + } else { + cf_iter->cfd->GetSuperVersion()->Unref(); + } + } +} + +namespace { // Order keys by CF ID, followed by key contents struct CompareKeyContext { inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) { - const Comparator* comparator = cfd->user_comparator(); + ColumnFamilyHandleImpl* cfh = + static_cast(lhs->column_family); + uint32_t cfd_id1 = cfh->cfd()->GetID(); + const Comparator* comparator = cfh->cfd()->user_comparator(); + cfh = static_cast(lhs->column_family); + uint32_t cfd_id2 = cfh->cfd()->GetID(); + + if (cfd_id1 < cfd_id2) { + return true; + } else if (cfd_id1 > cfd_id2) { + return false; + } + + // Both keys are from the same column family int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); if (cmp < 0) { return true; } return false; } - const ColumnFamilyData* cfd; }; +} // anonymous namespace + +void DBImpl::PrepareMultiGetKeys( + size_t num_keys, bool sorted_input, + autovector* sorted_keys) { +#ifndef NDEBUG + if (sorted_input) { + for (size_t index = 0; index < sorted_keys->size(); ++index) { + if (index > 0) { + KeyContext* lhs = (*sorted_keys)[index - 1]; + KeyContext* rhs = (*sorted_keys)[index]; + ColumnFamilyHandleImpl* cfh = + reinterpret_cast(lhs->column_family); + uint32_t cfd_id1 = cfh->cfd()->GetID(); + const Comparator* comparator = cfh->cfd()->user_comparator(); + cfh = 
reinterpret_cast(lhs->column_family); + uint32_t cfd_id2 = cfh->cfd()->GetID(); + + assert(cfd_id1 <= cfd_id2); + if (cfd_id1 < cfd_id2) { + continue; + } + + // Both keys are from the same column family + int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); + assert(cmp <= 0); + } + index++; + } + } +#endif + if (!sorted_input) { + CompareKeyContext sort_comparator; + std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys, + sort_comparator); + } +} + void DBImpl::MultiGet(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { autovector key_context; + autovector sorted_keys; + sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { - key_context.emplace_back(keys[i], &values[i], &statuses[i]); + key_context.emplace_back(column_family, keys[i], &values[i], &statuses[i]); } - - MultiGetImpl(read_options, column_family, key_context, sorted_input, nullptr, - nullptr); + for (size_t i = 0; i < num_keys; ++i) { + sorted_keys[i] = &key_context[i]; + } + PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); + MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys); } -void DBImpl::MultiGetImpl( +void DBImpl::MultiGetWithCallback( const ReadOptions& read_options, ColumnFamilyHandle* column_family, - autovector& key_context, - bool sorted_input, ReadCallback* callback, bool* is_blob_index) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); - size_t num_keys = key_context.size(); - - PERF_TIMER_GUARD(get_snapshot_time); - - ColumnFamilyHandleImpl* cfh = - reinterpret_cast(column_family); - ColumnFamilyData* cfd = cfh->cfd(); - - autovector sorted_keys; - sorted_keys.resize(num_keys); - { - size_t index = 0; - for (KeyContext& key : key_context) { + ReadCallback* callback, + autovector* sorted_keys) { + std::array multiget_cf_data; + multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr); + std::function::iterator&)> + iter_deref_lambda = + [](std::array::iterator& cf_iter) { + return &(*cf_iter); + }; + + size_t num_keys = sorted_keys->size(); + SequenceNumber consistent_seqnum; + bool unref_only = MultiCFSnapshot>( + read_options, callback, iter_deref_lambda, &multiget_cf_data, + &consistent_seqnum); #ifndef NDEBUG - if (index > 0 && sorted_input) { - KeyContext* lhs = &key_context[index-1]; - KeyContext* rhs = &key_context[index]; - const Comparator* comparator = cfd->user_comparator(); - int cmp = comparator->Compare(*(lhs->key), *(rhs->key)); - assert(cmp <= 0); - } -#endif - - sorted_keys[index] = &key; - index++; - } - if (!sorted_input) { - CompareKeyContext sort_comparator; - sort_comparator.cfd = cfd; - std::sort(sorted_keys.begin(), sorted_keys.begin() + index, - sort_comparator); - } + assert(!unref_only); +#else + // Silence unused variable warning + (void)unref_only; +#endif // NDEBUG + + if (callback && read_options.snapshot == nullptr) { + // The unprep_seqs are not published for write unprepared, so it could be + // that max_visible_seq is larger. Seek to the std::max of the two. + // However, we still want our callback to contain the actual snapshot so + // that it can do the correct visibility filtering. 
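Editor's note: PrepareMultiGetKeys and CompareKeyContext above order the key contexts by column family ID and then by that family's user comparator, so the batched path can process each family's keys as one contiguous run; callers that already provide keys in this order can pass sorted_input = true and skip the sort. A usage sketch of the array-based overload added by this patch (keys and handles are illustrative):

    // Sketch: one batched MultiGet spanning two column families.
    #include <vector>
    #include "rocksdb/db.h"

    void BatchedLookup(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf_a,
                       rocksdb::ColumnFamilyHandle* cf_b) {
      std::vector<rocksdb::Slice> keys = {"k1", "k2", "k3"};
      std::vector<rocksdb::ColumnFamilyHandle*> cfs = {cf_a, cf_a, cf_b};
      std::vector<rocksdb::PinnableSlice> values(keys.size());
      std::vector<rocksdb::Status> statuses(keys.size());
      // sorted_input = false: PrepareMultiGetKeys sorts by CF id, then by the
      // column family's comparator, before the batched lookups run.
      db->MultiGet(rocksdb::ReadOptions(), keys.size(), cfs.data(), keys.data(),
                   values.data(), statuses.data(), /*sorted_input=*/false);
    }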
+ callback->Refresh(consistent_seqnum); + + // Internally, WriteUnpreparedTxnReadCallback::Refresh would set + // max_visible_seq = max(max_visible_seq, snapshot) + // + // Currently, the commented out assert is broken by + // InvalidSnapshotReadCallback, but if write unprepared recovery followed + // the regular transaction flow, then this special read callback would not + // be needed. + // + // assert(callback->max_visible_seq() >= snapshot); + consistent_seqnum = callback->max_visible_seq(); } - // Keep track of bytes that we read for statistics-recording later - PERF_TIMER_STOP(get_snapshot_time); + MultiGetImpl(read_options, 0, num_keys, sorted_keys, + multiget_cf_data[0].super_version, consistent_seqnum, nullptr, + nullptr); + ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, + multiget_cf_data[0].super_version); +} - // Acquire SuperVersion - SuperVersion* super_version = GetAndRefSuperVersion(cfd); - SequenceNumber snapshot; - if (read_options.snapshot != nullptr) { - // Note: In WritePrepared txns this is not necessary but not harmful - // either. Because prep_seq > snapshot => commit_seq > snapshot so if - // a snapshot is specified we should be fine with skipping seq numbers - // that are greater than that. - // - // In WriteUnprepared, we cannot set snapshot in the lookup key because we - // may skip uncommitted data that should be visible to the transaction for - // reading own writes. - snapshot = - reinterpret_cast(read_options.snapshot)->number_; - if (callback) { - snapshot = std::max(snapshot, callback->max_visible_seq()); - } - } else { - // Since we get and reference the super version before getting - // the snapshot number, without a mutex protection, it is possible - // that a memtable switch happened in the middle and not all the - // data for this snapshot is available. But it will contain all - // the data available in the super version we have, which is also - // a valid snapshot to read from. - // We shouldn't get snapshot before finding and referencing the super - // version because a flush happening in between may compact away data for - // the snapshot, but the snapshot is earlier than the data overwriting it, - // so users may see wrong results. - snapshot = last_seq_same_as_publish_seq_ - ? versions_->LastSequence() - : versions_->LastPublishedSequence(); - } +void DBImpl::MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* super_version, SequenceNumber snapshot, + ReadCallback* callback, bool* is_blob_index) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_MULTIGET); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). @@ -1765,49 +2090,34 @@ void DBImpl::MultiGetImpl( size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE) ? 
MultiGetContext::MAX_BATCH_SIZE : keys_left; - MultiGetContext ctx(&sorted_keys[num_keys - keys_left], batch_size, - snapshot); + MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left, + batch_size, snapshot); MultiGetRange range = ctx.GetMultiGetRange(); bool lookup_current = false; keys_left -= batch_size; for (auto mget_iter = range.begin(); mget_iter != range.end(); ++mget_iter) { - MergeContext& merge_context = mget_iter->merge_context; - merge_context.Clear(); - Status& s = *mget_iter->s; - PinnableSlice* value = mget_iter->value; - s = Status::OK(); + mget_iter->merge_context.Clear(); + *mget_iter->s = Status::OK(); + } - bool skip_memtable = - (read_options.read_tier == kPersistedTier && - has_unpersisted_data_.load(std::memory_order_relaxed)); - bool done = false; - if (!skip_memtable) { - if (super_version->mem->Get(*(mget_iter->lkey), value->GetSelf(), &s, - &merge_context, - &mget_iter->max_covering_tombstone_seq, - read_options, callback, is_blob_index)) { - done = true; - value->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } else if (super_version->imm->Get( - *(mget_iter->lkey), value->GetSelf(), &s, &merge_context, - &mget_iter->max_covering_tombstone_seq, read_options, - callback, is_blob_index)) { - done = true; - value->PinSelf(); - RecordTick(stats_, MEMTABLE_HIT); - } + bool skip_memtable = + (read_options.read_tier == kPersistedTier && + has_unpersisted_data_.load(std::memory_order_relaxed)); + if (!skip_memtable) { + super_version->mem->MultiGet(read_options, &range, callback, + is_blob_index); + if (!range.empty()) { + super_version->imm->MultiGet(read_options, &range, callback, + is_blob_index); } - if (done) { - range.MarkKeyDone(mget_iter); - } else { - RecordTick(stats_, MEMTABLE_MISS); + if (!range.empty()) { lookup_current = true; + uint64_t left = range.KeysLeft(); + RecordTick(stats_, MEMTABLE_MISS, left); } } - if (lookup_current) { PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->MultiGet(read_options, &range, callback, @@ -1819,15 +2129,14 @@ void DBImpl::MultiGetImpl( PERF_TIMER_GUARD(get_post_process_time); size_t num_found = 0; uint64_t bytes_read = 0; - for (KeyContext& key : key_context) { - if (key.s->ok()) { - bytes_read += key.value->size(); + for (size_t i = start_key; i < start_key + num_keys; ++i) { + KeyContext* key = (*sorted_keys)[i]; + if (key->s->ok()) { + bytes_read += key->value->size(); num_found++; } } - ReturnAndCleanupSuperVersion(cfd, super_version); - RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); @@ -1912,13 +2221,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, Status persist_options_status; *handle = nullptr; - s = CheckCompressionSupported(cf_options); - if (s.ok() && immutable_db_options_.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cf_options); - } - if (s.ok()) { - s = CheckCFPathsSupported(initial_db_options_, cf_options); - } + DBOptions db_options = + BuildDBOptions(immutable_db_options_, mutable_db_options_); + s = ColumnFamilyData::ValidateOptions(db_options, cf_options); if (s.ok()) { for (auto& cf_path : cf_options.cf_paths) { s = env_->CreateDirIfMissing(cf_path.path); @@ -2105,7 +2410,11 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; - auto s = GetImpl(roptions, 
column_family, key, &pinnable_val, value_found); + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.value = &pinnable_val; + get_impl_options.value_found = value_found; + auto s = GetImpl(roptions, key, get_impl_options); value->assign(pinnable_val.data(), pinnable_val.size()); // If block_cache is enabled and the index block of the table didn't @@ -2671,11 +2980,13 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, ReturnAndCleanupSuperVersion(cfd, sv); } -void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - uint8_t include_flags) { - assert(include_flags & DB::SizeApproximationFlags::INCLUDE_FILES || - include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES); +Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, uint64_t* sizes) { + if (!options.include_memtabtles && !options.include_files) { + return Status::InvalidArgument("Invalid options"); + } + Version* v; auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -2687,16 +2998,19 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; - if (include_flags & DB::SizeApproximationFlags::INCLUDE_FILES) { - sizes[i] += versions_->ApproximateSize(v, k1.Encode(), k2.Encode()); + if (options.include_files) { + sizes[i] += versions_->ApproximateSize( + options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); } - if (include_flags & DB::SizeApproximationFlags::INCLUDE_MEMTABLES) { + if (options.include_memtabtles) { sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size; sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size; } } ReturnAndCleanupSuperVersion(cfd, sv); + return Status::OK(); } std::list::iterator @@ -2711,8 +3025,11 @@ DBImpl::CaptureCurrentFileNumberInPendingOutputs() { } void DBImpl::ReleaseFileNumberFromPendingOutputs( - std::list::iterator v) { - pending_outputs_.erase(v); + std::unique_ptr::iterator>& v) { + if (v.get() != nullptr) { + pending_outputs_.erase(*v.get()); + v.reset(); + } } #ifndef ROCKSDB_LITE @@ -2924,7 +3241,19 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, assert(column_family); auto* cfd = reinterpret_cast(column_family)->cfd(); auto* sv = GetAndRefSuperVersion(cfd); - sv->current->GetColumnFamilyMetaData(cf_meta); + { + // Without mutex, Version::GetColumnFamilyMetaData will have data race with + // Compaction::MarkFilesBeingCompacted. One solution is to use mutex, but + // this may cause regression. An alternative is to make + // FileMetaData::being_compacted atomic, but it will make FileMetaData + // non-copy-able. Another option is to separate these variables from + // original FileMetaData struct, and this requires re-organization of data + // structures. For now, we take the easy approach. If + // DB::GetColumnFamilyMetaData is not called frequently, the regression + // should not be big. We still need to keep an eye on it. 
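Editor's note: GetApproximateSizes above becomes Status-returning and takes a SizeApproximationOptions struct instead of bit flags, rejecting the request when neither files nor memtables are included. A minimal sketch (the range endpoints are illustrative; note that include_memtabtles is the field's actual spelling in the struct):

    // Sketch: approximate on-disk plus in-memory size of one key range.
    #include <cstdint>
    #include "rocksdb/db.h"

    rocksdb::Status ApproxRangeSize(rocksdb::DB* db, uint64_t* size) {
      rocksdb::SizeApproximationOptions opts;
      opts.include_files = true;
      opts.include_memtabtles = true;  // sic, spelling as in the options struct
      rocksdb::Range r("a", "z");
      return db->GetApproximateSizes(opts, db->DefaultColumnFamily(), &r,
                                     /*n=*/1, size);
    }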
+ InstrumentedMutexLock l(&mutex_); + sv->current->GetColumnFamilyMetaData(cf_meta); + } ReturnAndCleanupSuperVersion(cfd, sv); } @@ -2934,6 +3263,7 @@ Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; versions_->GetLiveFilesMetaData(&metadata); + TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData"); std::string corruption_messages; for (const auto& md : metadata) { @@ -2941,6 +3271,7 @@ Status DBImpl::CheckConsistency() { std::string file_path = md.db_path + md.name; uint64_t fsize = 0; + TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize"); Status s = env_->GetFileSize(file_path, &fsize); if (!s.ok() && env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) { @@ -2964,6 +3295,11 @@ Status DBImpl::CheckConsistency() { } Status DBImpl::GetDbIdentity(std::string& identity) const { + identity.assign(db_id_); + return Status::OK(); +} + +Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const { std::string idfilename = IdentityFileName(dbname_); const EnvOptions soptions; std::unique_ptr id_file_reader; @@ -2990,10 +3326,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { if (!s.ok()) { return s; } - identity.assign(id.ToString()); + identity->assign(id.ToString()); // If last character is '\n' remove it from identity - if (identity.size() > 0 && identity.back() == '\n') { - identity.pop_back(); + if (identity->size() > 0 && identity->back() == '\n') { + identity->pop_back(); } return s; } @@ -3036,6 +3372,14 @@ DB::~DB() {} Status DBImpl::Close() { if (!closed_) { + { + InstrumentedMutexLock l(&mutex_); + // If there is unreleased snapshot, fail the close call + if (!snapshots_.empty()) { + return Status::Aborted("Cannot close DB with unreleased snapshot."); + } + } + closed_ = true; return CloseImpl(); } @@ -3055,6 +3399,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; + bool wal_in_db_path = IsWalDirSameAsDBPath(&soptions); // Reset the logger because it holds a handle to the // log file and prevents cleanup and directory removal @@ -3077,7 +3422,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); } else if (type == kTableFile || type == kLogFile) { - del = DeleteDBFile(&soptions, path_to_delete, dbname); + del = DeleteDBFile(&soptions, path_to_delete, dbname, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); } else { del = env->DeleteFile(path_to_delete); } @@ -3111,7 +3457,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(fname, &number, &type) && type == kTableFile) { // Lock file will be deleted at end std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname); + Status del = DeleteDBFile(&soptions, table_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); if (result.ok() && !del.ok()) { result = del; } @@ -3138,7 +3485,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, for (const auto& file : archiveFiles) { if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = - DeleteDBFile(&soptions, archivedir + "/" + file, archivedir); + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } @@ -3153,7 +3501,8 @@ Status 
DestroyDB(const std::string& dbname, const Options& options, if (ParseFileName(file, &number, &type) && type == kLogFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), - soptions.wal_dir); + soptions.wal_dir, /*force_bg=*/false, + /*force_fg=*/!wal_in_db_path); if (result.ok() && !del.ok()) { result = del; } @@ -3370,7 +3719,9 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, #ifndef ROCKSDB_LITE Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, SequenceNumber* seq, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, bool* found_record_for_key, bool* is_blob_index) { Status s; @@ -3403,6 +3754,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber(); + if (lower_bound_in_mem != kMaxSequenceNumber && + lower_bound_in_mem < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + // Check if there is a record for this key in the immutable memtables sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, seq, read_options, nullptr /*read_callback*/, is_blob_index); @@ -3422,6 +3780,13 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber(); + if (lower_bound_in_imm != kMaxSequenceNumber && + lower_bound_in_imm < lower_bound_seq) { + *found_record_for_key = false; + return Status::OK(); + } + // Check if there is a record for this key in the immutable memtables sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, seq, read_options, @@ -3443,6 +3808,10 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, return Status::OK(); } + // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true) + // check here to skip the history if possible. But currently the caller + // already does that. Maybe we should move the logic here later. + // TODO(agiardullo): possible optimization: consider checking cached // SST files if cache_only=true? 
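The lower_bound_seq early-outs added above reduce to a simple rule: if every entry a memtable could contain is older than the caller's lower bound, that memtable cannot hold a newer record for the key, so the lookup can be skipped. A stand-alone sketch of the predicate (illustrative only, not RocksDB code):

    #include <cstdint>
    #include <limits>

    // earliest_seq is the smallest sequence number the memtable may contain;
    // kUnknown means the memtable cannot be ruled out and must be searched.
    constexpr uint64_t kUnknown = std::numeric_limits<uint64_t>::max();

    bool CanSkipMemtable(uint64_t earliest_seq, uint64_t lower_bound_seq) {
      return earliest_seq != kUnknown && earliest_seq < lower_bound_seq;
    }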
if (!cache_only) { @@ -3511,7 +3880,7 @@ Status DBImpl::IngestExternalFiles( // TODO (yanqin) maybe handle the case in which column_families have // duplicates - std::list::iterator pending_output_elem; + std::unique_ptr::iterator> pending_output_elem; size_t total = 0; for (const auto& arg : args) { total += arg.external_files.size(); @@ -3519,7 +3888,7 @@ Status DBImpl::IngestExternalFiles( uint64_t next_file_number = 0; Status status = ReserveFileNumbersBeforeIngestion( static_cast(args[0].column_family)->cfd(), total, - &pending_output_elem, &next_file_number); + pending_output_elem, &next_file_number); if (!status.ok()) { InstrumentedMutexLock l(&mutex_); ReleaseFileNumberFromPendingOutputs(pending_output_elem); @@ -3529,18 +3898,18 @@ Status DBImpl::IngestExternalFiles( std::vector ingestion_jobs; for (const auto& arg : args) { auto* cfd = static_cast(arg.column_family)->cfd(); - ingestion_jobs.emplace_back(env_, versions_.get(), cfd, - immutable_db_options_, env_options_, - &snapshots_, arg.options); + ingestion_jobs.emplace_back( + env_, versions_.get(), cfd, immutable_db_options_, env_options_, + &snapshots_, arg.options, &directories_, &event_logger_); } std::vector> exec_results; for (size_t i = 0; i != num_cfs; ++i) { exec_results.emplace_back(false, Status::OK()); } // TODO(yanqin) maybe make jobs run in parallel + uint64_t start_file_number = next_file_number; for (size_t i = 1; i != num_cfs; ++i) { - uint64_t start_file_number = - next_file_number + args[i - 1].external_files.size(); + start_file_number += args[i - 1].external_files.size(); auto* cfd = static_cast(args[i].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); @@ -3661,19 +4030,21 @@ Status DBImpl::IngestExternalFiles( } } if (status.ok()) { - bool should_increment_last_seqno = - ingestion_jobs[0].ShouldIncrementLastSequence(); + int consumed_seqno_count = + ingestion_jobs[0].ConsumedSequenceNumbersCount(); #ifndef NDEBUG for (size_t i = 1; i != num_cfs; ++i) { - assert(should_increment_last_seqno == - ingestion_jobs[i].ShouldIncrementLastSequence()); + assert(!!consumed_seqno_count == + !!ingestion_jobs[i].ConsumedSequenceNumbersCount()); + consumed_seqno_count += + ingestion_jobs[i].ConsumedSequenceNumbersCount(); } #endif - if (should_increment_last_seqno) { + if (consumed_seqno_count > 0) { const SequenceNumber last_seqno = versions_->LastSequence(); - versions_->SetLastAllocatedSequence(last_seqno + 1); - versions_->SetLastPublishedSequence(last_seqno + 1); - versions_->SetLastSequence(last_seqno + 1); + versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count); + versions_->SetLastSequence(last_seqno + consumed_seqno_count); } autovector cfds_to_commit; autovector mutable_cf_options_list; @@ -3764,7 +4135,127 @@ Status DBImpl::IngestExternalFiles( return status; } -Status DBImpl::VerifyChecksum() { +Status DBImpl::CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { + assert(handle != nullptr); + assert(*handle == nullptr); + std::string cf_comparator_name = options.comparator->Name(); + if (cf_comparator_name != metadata.db_comparator_name) { + return Status::InvalidArgument("Comparator name mismatch"); + } + + // Create column family. 
+ auto status = CreateColumnFamily(options, column_family_name, handle); + if (!status.ok()) { + return status; + } + + // Import sst files from metadata. + auto cfh = reinterpret_cast(*handle); + auto cfd = cfh->cfd(); + ImportColumnFamilyJob import_job(env_, versions_.get(), cfd, + immutable_db_options_, env_options_, + import_options, metadata.files); + + SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); + VersionEdit dummy_edit; + uint64_t next_file_number = 0; + std::unique_ptr::iterator> pending_output_elem; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + if (error_handler_.IsDBStopped()) { + // Don't import files when there is a bg_error + status = error_handler_.GetBGError(); + } + + // Make sure that bg cleanup wont delete the files that we are importing + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); + + if (status.ok()) { + // If crash happen after a hard link established, Recover function may + // reuse the file number that has already assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never being reused. + next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + } + } + dummy_sv_ctx.Clean(); + + if (status.ok()) { + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + status = import_job.Prepare(next_file_number, sv); + CleanupSuperVersion(sv); + } + + if (status.ok()) { + SuperVersionContext sv_context(true /*create_superversion*/); + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + + // Stop writes to the DB by entering both write threads + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + + num_running_ingest_file_++; + assert(!cfd->IsDropped()); + status = import_job.Run(); + + // Install job edit [Mutex will be unlocked here] + if (status.ok()) { + auto cf_options = cfd->GetLatestMutableCFOptions(); + status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(), + &mutex_, directories_.GetDbDir()); + if (status.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); + } + } + + // Resume writes to the DB + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); + + num_running_ingest_file_--; + if (num_running_ingest_file_ == 0) { + bg_cv_.SignalAll(); + } + } + // mutex_ is unlocked here + + sv_context.Clean(); + } + + { + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + } + + import_job.Cleanup(status); + if (!status.ok()) { + DropColumnFamily(*handle); + DestroyColumnFamilyHandle(*handle); + *handle = nullptr; + } + return status; +} + +Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { Status s; std::vector cfd_list; { @@ -3795,7 +4286,8 @@ Status DBImpl::VerifyChecksum() { const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); - s = rocksdb::VerifySstFileChecksum(opts, env_options_, fname); + s = 
rocksdb::VerifySstFileChecksum(opts, env_options_, read_options, + fname); } } if (!s.ok()) { @@ -3862,6 +4354,18 @@ Status DBImpl::EndTrace() { return s; } +Status DBImpl::StartBlockCacheTrace( + const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + return block_cache_tracer_.StartTrace(env_, trace_options, + std::move(trace_writer)); +} + +Status DBImpl::EndBlockCacheTrace() { + block_cache_tracer_.EndTrace(); + return Status::OK(); +} + Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { Status s; if (tracer_) { @@ -3887,18 +4391,18 @@ Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, Status DBImpl::ReserveFileNumbersBeforeIngestion( ColumnFamilyData* cfd, uint64_t num, - std::list::iterator* pending_output_elem, + std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number) { Status s; SuperVersionContext dummy_sv_ctx(true /* create_superversion */); - assert(nullptr != pending_output_elem); assert(nullptr != next_file_number); InstrumentedMutexLock l(&mutex_); if (error_handler_.IsDBStopped()) { // Do not ingest files when there is a bg_error return error_handler_.GetBGError(); } - *pending_output_elem = CaptureCurrentFileNumberInPendingOutputs(); + pending_output_elem.reset(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); *next_file_number = versions_->FetchAddFileNumber(static_cast(num)); auto cf_options = cfd->GetLatestMutableCFOptions(); VersionEdit dummy_edit; @@ -3914,6 +4418,26 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( dummy_sv_ctx.Clean(); return s; } + +Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) { + if (mutable_db_options_.max_open_files == -1) { + uint64_t oldest_time = port::kMaxUint64; + for (auto cfd : *versions_->GetColumnFamilySet()) { + uint64_t ctime; + cfd->current()->GetCreationTimeOfOldestFile(&ctime); + if (ctime < oldest_time) { + oldest_time = ctime; + } + if (oldest_time == 0) { + break; + } + } + *creation_time = oldest_time; + return Status::OK(); + } else { + return Status::NotSupported("This API only works if max_open_files = -1"); + } +} #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_impl.h b/db/db_impl/db_impl.h similarity index 81% rename from db/db_impl.h rename to db/db_impl/db_impl.h index 623f69ba6ef..fe97e08bec1 100644 --- a/db/db_impl.h +++ b/db/db_impl/db_impl.h @@ -20,26 +20,29 @@ #include #include "db/column_family.h" -#include "db/compaction_job.h" +#include "db/compaction/compaction_job.h" #include "db/dbformat.h" #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/flush_scheduler.h" +#include "db/import_column_family_job.h" #include "db/internal_stats.h" #include "db/log_writer.h" #include "db/logs_with_prep_tracker.h" +#include "db/memtable_list.h" #include "db/pre_release_callback.h" #include "db/range_del_aggregator.h" #include "db/read_callback.h" #include "db/snapshot_checker.h" #include "db/snapshot_impl.h" +#include "db/trim_history_scheduler.h" #include "db/version_edit.h" #include "db/wal_manager.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "memtable_list.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" @@ -51,13 +54,13 @@ #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" +#include 
"trace_replay/block_cache_tracer.h" +#include "trace_replay/trace_replay.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/hash.h" #include "util/repeatable_thread.h" #include "util/stop_watch.h" #include "util/thread_local.h" -#include "util/trace_replay.h" namespace rocksdb { @@ -65,6 +68,7 @@ class Arena; class ArenaWrappedDBIter; class InMemoryStatsHistoryIterator; class MemTable; +class PersistentStatsHistoryIterator; class TableCache; class TaskLimiterToken; class Version; @@ -75,16 +79,66 @@ struct JobContext; struct ExternalSstFileInfo; struct MemTableInfo; +// Class to maintain directories for all database paths other than main one. +class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); + + Directory* GetDataDir(size_t path_id) const { + assert(path_id < data_dirs_.size()); + Directory* ret_dir = data_dirs_[path_id].get(); + if (ret_dir == nullptr) { + // Should use db_dir_ + return db_dir_.get(); + } + return ret_dir; + } + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; +}; + +// While DB is the public interface of RocksDB, and DBImpl is the actual +// class implementing it. It's the entrance of the core RocksdB engine. +// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a +// DBImpl internally. +// Other than functions implementing the DB interface, some public +// functions are there for other internal components to call. For +// example, TransactionDB directly calls DBImpl::WriteImpl() and +// BlobDB directly calls DBImpl::GetImpl(). Some other functions +// are for sub-components to call. For example, ColumnFamilyHandleImpl +// calls DBImpl::FindObsoleteFiles(). +// +// Since it's a very large class, the definition of the functions is +// divided in several db_impl_*.cc files, besides db_impl.cc. 
class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, const bool seq_per_batch = false, const bool batch_per_txn = true); + // No copying allowed + DBImpl(const DBImpl&) = delete; + void operator=(const DBImpl&) = delete; + virtual ~DBImpl(); + // ---- Implementations of the DB interface ---- + using DB::Resume; virtual Status Resume() override; - // Implementations of the DB interface using DB::Put; virtual Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -110,12 +164,20 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; - // Function that Get and KeyMayExist call with no_io true or false - // Note: 'value_found' from KeyMayExist propagates here - Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - bool* value_found = nullptr, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + using DB::GetMergeOperands; + Status GetMergeOperands(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + GetImplOptions get_impl_options; + get_impl_options.column_family = column_family; + get_impl_options.merge_operands = merge_operands; + get_impl_options.get_merge_operands_options = get_merge_operands_options; + get_impl_options.number_of_operands = number_of_operands; + get_impl_options.get_value = false; + return GetImpl(options, key, get_impl_options); + } using DB::MultiGet; virtual std::vector MultiGet( @@ -137,11 +199,15 @@ class DBImpl : public DB { PinnableSlice* values, Status* statuses, const bool sorted_input = false) override; - void MultiGetImpl( + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + + virtual void MultiGetWithCallback( const ReadOptions& options, ColumnFamilyHandle* column_family, - autovector& key_context, - bool sorted_input, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + ReadCallback* callback, + autovector* sorted_keys); virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family, @@ -174,12 +240,6 @@ class DBImpl : public DB { const ReadOptions& options, const std::vector& column_families, std::vector* iterators) override; - ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, - ColumnFamilyData* cfd, - SequenceNumber snapshot, - ReadCallback* read_callback, - bool allow_blob = false, - bool allow_refresh = true); virtual const Snapshot* GetSnapshot() override; virtual void ReleaseSnapshot(const Snapshot* snapshot) override; @@ -197,9 +257,10 @@ class DBImpl : public DB { virtual bool GetAggregatedIntProperty(const Slice& property, uint64_t* aggregated_value) override; using DB::GetApproximateSizes; - virtual void GetApproximateSizes( - ColumnFamilyHandle* column_family, const Range* range, int n, - uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) override; + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) override; using DB::GetApproximateMemTableStats; virtual void GetApproximateMemTableStats(ColumnFamilyHandle* 
column_family, const Range& range, @@ -225,6 +286,9 @@ class DBImpl : public DB { virtual Status EnableAutoCompaction( const std::vector& column_family_handles) override; + virtual void EnableManualCompaction() override; + virtual void DisableManualCompaction() override; + using DB::SetOptions; Status SetOptions( ColumnFamilyHandle* column_family, @@ -259,23 +323,23 @@ class DBImpl : public DB { virtual Status UnlockWAL() override; virtual SequenceNumber GetLatestSequenceNumber() const override; - virtual SequenceNumber GetLastPublishedSequence() const { - if (last_seq_same_as_publish_seq_) { - return versions_->LastSequence(); - } else { - return versions_->LastPublishedSequence(); - } - } - // REQUIRES: joined the main write queue if two_write_queues is disabled, and - // the second write queue otherwise. - virtual void SetLastPublishedSequence(SequenceNumber seq); - // Returns LastSequence in last_seq_same_as_publish_seq_ - // mode and LastAllocatedSequence otherwise. This is useful when visiblility - // depends also on data written to the WAL but not to the memtable. - SequenceNumber TEST_GetLastVisibleSequence() const; virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; + virtual Status GetDbIdentity(std::string& identity) const override; + + virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const; + + ColumnFamilyHandle* DefaultColumnFamily() const override; + + ColumnFamilyHandle* PersistentStatsColumnFamily() const; + + virtual Status Close() override; + + Status GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) override; + #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; @@ -287,6 +351,10 @@ class DBImpl : public DB { uint64_t* manifest_file_size, bool flush_memtable = true) override; virtual Status GetSortedWalFiles(VectorLogPtr& files) override; + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override; + virtual Status GetCreationTimeOfOldestFile( + uint64_t* creation_time) override; virtual Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, @@ -313,6 +381,105 @@ class DBImpl : public DB { Status PromoteL0(ColumnFamilyHandle* column_family, int target_level) override; + using DB::IngestExternalFile; + virtual Status IngestExternalFile( + ColumnFamilyHandle* column_family, + const std::vector& external_files, + const IngestExternalFileOptions& ingestion_options) override; + + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override; + + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override; + + using DB::VerifyChecksum; + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; + + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override; + + using DB::GetPropertiesOfAllTables; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + 
TablePropertiesCollection* props) override; + virtual Status GetPropertiesOfTablesInRange( + ColumnFamilyHandle* column_family, const Range* range, std::size_t n, + TablePropertiesCollection* props) override; + +#endif // ROCKSDB_LITE + + // ---- End of implementations of the DB interface ---- + + struct GetImplOptions { + ColumnFamilyHandle* column_family = nullptr; + PinnableSlice* value = nullptr; + bool* value_found = nullptr; + ReadCallback* callback = nullptr; + bool* is_blob_index = nullptr; + // If true return value associated with key via value pointer else return + // all merge operands for key via merge_operands pointer + bool get_value = true; + // Pointer to an array of size + // get_merge_operands_options.expected_max_number_of_operands allocated by + // user + PinnableSlice* merge_operands = nullptr; + GetMergeOperandsOptions* get_merge_operands_options = nullptr; + int* number_of_operands = nullptr; + }; + + // Function that Get and KeyMayExist call with no_io true or false + // Note: 'value_found' from KeyMayExist propagates here + // This function is also called by GetMergeOperands + // If get_impl_options.get_value = true get value associated with + // get_impl_options.key via get_impl_options.value + // If get_impl_options.get_value = false get merge operands associated with + // get_impl_options.key via get_impl_options.merge_operands + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions get_impl_options); + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback, + bool allow_blob = false, + bool allow_refresh = true); + + virtual SequenceNumber GetLastPublishedSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastPublishedSequence(); + } + } + + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + // the second write queue otherwise. + virtual void SetLastPublishedSequence(SequenceNumber seq); + // Returns LastSequence in last_seq_same_as_publish_seq_ + // mode and LastAllocatedSequence otherwise. This is useful when visiblility + // depends also on data written to the WAL but not to the memtable. + SequenceNumber TEST_GetLastVisibleSequence() const; + +#ifndef ROCKSDB_LITE // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. virtual Status WriteWithCallback(const WriteOptions& write_options, @@ -352,33 +519,20 @@ class DBImpl : public DB { // snapshot, we know that no key could have existing after this snapshot // (since we do not compact keys that have an earlier snapshot). // + // Only records newer than or at `lower_bound_seq` are guaranteed to be + // returned. Memtables and files may not be checked if it only contains data + // older than `lower_bound_seq`. + // // Returns OK or NotFound on success, // other status on unexpected error. 
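The GetImplOptions plumbing above is internal; from the outside the merge-operand read path is exercised through the new GetMergeOperands() entry point. A minimal caller-side sketch, assuming an open rocksdb::DB* db (the key and operand count are illustrative):

    std::vector<rocksdb::PinnableSlice> operands(8);
    rocksdb::GetMergeOperandsOptions merge_opts;
    merge_opts.expected_max_number_of_operands = 8;  // size of the array above
    int num_operands = 0;
    rocksdb::Status s = db->GetMergeOperands(
        rocksdb::ReadOptions(), db->DefaultColumnFamily(), "counter_key",
        operands.data(), &merge_opts, &num_operands);
    // On success, operands[0..num_operands) hold the key's unmerged operands.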
// TODO(andrewkr): this API need to be aware of range deletion operations Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, - bool cache_only, SequenceNumber* seq, + bool cache_only, + SequenceNumber lower_bound_seq, + SequenceNumber* seq, bool* found_record_for_key, bool* is_blob_index = nullptr); - using DB::IngestExternalFile; - virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& ingestion_options) override; - - using DB::IngestExternalFiles; - virtual Status IngestExternalFiles( - const std::vector& args) override; - - virtual Status VerifyChecksum() override; - - using DB::StartTrace; - virtual Status StartTrace( - const TraceOptions& options, - std::unique_ptr&& trace_writer) override; - - using DB::EndTrace; - virtual Status EndTrace() override; Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); #endif // ROCKSDB_LITE @@ -393,8 +547,6 @@ class DBImpl : public DB { // match to our in-memory records virtual Status CheckConsistency(); - virtual Status GetDbIdentity(std::string& identity) const override; - // max_file_num_to_ignore allows bottom level compaction to filter out newly // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will // disable the filtering @@ -416,102 +568,6 @@ class DBImpl : public DB { return &logs_with_prep_tracker_; } -#ifndef NDEBUG - // Extra methods (for testing) that are not in the public DB interface - // Implemented in db_impl_debug.cc - - // Compact any files in the named level that overlap [*begin, *end] - Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, - ColumnFamilyHandle* column_family = nullptr, - bool disallow_trivial_move = false); - - void TEST_SwitchWAL(); - - bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } - - bool TEST_IsLogGettingFlushed() { - return alive_log_files_.begin()->getting_flushed; - } - - Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); - - // Force current memtable contents to be flushed. - Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, - ColumnFamilyHandle* cfh = nullptr); - - // Wait for memtable compaction - Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); - - // Wait for any compaction - // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this - // is only for the special test of CancelledCompactions - Status TEST_WaitForCompact(bool waitUnscheduled = false); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes( - ColumnFamilyHandle* column_family = nullptr); - - // Return the current manifest file no. - uint64_t TEST_Current_Manifest_FileNo(); - - // Returns the number that'll be assigned to the next file that's created. - uint64_t TEST_Current_Next_FileNo(); - - // get total level0 file size. Only for testing. 
- uint64_t TEST_GetLevel0TotalSize(); - - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, - std::vector>* metadata); - - void TEST_LockMutex(); - - void TEST_UnlockMutex(); - - // REQUIRES: mutex locked - void* TEST_BeginWrite(); - - // REQUIRES: mutex locked - // pass the pointer that you got from TEST_BeginWrite() - void TEST_EndWrite(void* w); - - uint64_t TEST_MaxTotalInMemoryState() const { - return max_total_in_memory_state_; - } - - size_t TEST_LogsToFreeSize(); - - uint64_t TEST_LogfileNumber(); - - uint64_t TEST_total_log_size() const { return total_log_size_; } - - // Returns column family name to ImmutableCFOptions map. - Status TEST_GetAllImmutableCFOptions( - std::unordered_map* iopts_map); - - // Return the lastest MutableCFOptions of a column family - Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, - MutableCFOptions* mutable_cf_options); - - Cache* TEST_table_cache() { return table_cache_.get(); } - - WriteController& TEST_write_controler() { return write_controller_; } - - uint64_t TEST_FindMinLogContainingOutstandingPrep(); - uint64_t TEST_FindMinPrepLogReferencedByMemTable(); - size_t TEST_PreparedSectionCompletedSize(); - size_t TEST_LogsWithPrepSize(); - - int TEST_BGCompactionsAllowed() const; - int TEST_BGFlushesAllowed() const; - size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - void TEST_WaitForDumpStatsRun(std::function callback) const; - void TEST_WaitForPersistStatsRun(std::function callback) const; - bool TEST_IsPersistentStatsEnabled() const; - size_t TEST_EstiamteStatsHistorySize() const; - -#endif // NDEBUG - struct BGJobLimits { int max_flushes; int max_compactions; @@ -555,12 +611,15 @@ class DBImpl : public DB { void PurgeObsoleteFiles(JobContext& background_contet, bool schedule_only = false); + // Schedule a background job to actually delete obsolete files. void SchedulePurge(); - ColumnFamilyHandle* DefaultColumnFamily() const override; - const SnapshotList& snapshots() const { return snapshots_; } + // load list of snapshots to `snap_vector` that is no newer than `max_seq` + // in ascending order. + // `oldest_write_conflict_snapshot` is filled with the oldest snapshot + // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true. void LoadSnapshots(std::vector* snap_vector, SequenceNumber* oldest_write_conflict_snapshot, const SequenceNumber& max_seq) const { @@ -572,6 +631,10 @@ class DBImpl : public DB { return immutable_db_options_; } + // Cancel all background jobs, including flush, compaction, background + // purging, stats dumping threads, etc. If `wait` = true, wait for the + // running jobs to abort or finish before returning. Otherwise, only + // sends the signals. void CancelAllBackgroundWork(bool wait); // Find Super version and reference it. Based on options, it might return @@ -725,7 +788,7 @@ class DBImpl : public DB { void DeleteAllRecoveredTransactions() { for (auto it = recovered_transactions_.begin(); - it != recovered_transactions_.end(); it++) { + it != recovered_transactions_.end(); ++it) { delete it->second; } recovered_transactions_.clear(); @@ -748,6 +811,8 @@ class DBImpl : public DB { InstrumentedMutex* mutex() const { return &mutex_; } + // Initialize a brand new DB. The DB directory is expected to be empty before + // calling it. Status NewDB(); // This is to be used only by internal rocksdb classes. 
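CancelAllBackgroundWork() above is usually reached through the free function of the same name declared in rocksdb/convenience.h; a minimal shutdown sketch, assuming an open rocksdb::DB* db:

    #include "rocksdb/convenience.h"

    // Stop flushes/compactions and wait for running jobs before tearing down.
    rocksdb::CancelAllBackgroundWork(db, /*wait=*/true);
    rocksdb::Status s = db->Close();  // fails if snapshots are still unreleased
    delete db;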
@@ -756,25 +821,129 @@ class DBImpl : public DB { std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); - virtual Status Close() override; static Status CreateAndNewDirectory(Env* env, const std::string& dirname, std::unique_ptr* directory); - // Given a time window, return an iterator for accessing stats history - Status GetStatsHistory( - uint64_t start_time, uint64_t end_time, - std::unique_ptr* stats_iterator) override; - // find stats map from stats_history_ with smallest timestamp in // the range of [start_time, end_time) bool FindStatsByTime(uint64_t start_time, uint64_t end_time, uint64_t* new_time, std::map* stats_map); + // Print information of all tombstones of all iterators to the std::string + // This is only used by ldb. The output might be capped. Tombstones + // printed out are not guaranteed to be in any order. + Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str); + +#ifndef NDEBUG + // Compact any files in the named level that overlap [*begin, *end] + Status TEST_CompactRange(int level, const Slice* begin, const Slice* end, + ColumnFamilyHandle* column_family = nullptr, + bool disallow_trivial_move = false); + + void TEST_SwitchWAL(); + + bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } + + bool TEST_IsLogGettingFlushed() { + return alive_log_files_.begin()->getting_flushed; + } + + Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); + + // Force current memtable contents to be flushed. + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, + ColumnFamilyHandle* cfh = nullptr); + + Status TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts); + + // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This + // is because in certain cases, we can flush column families, wait for the + // flush to complete, but delete the column family handle before the wait + // finishes. For example in CompactRange. + Status TEST_AtomicFlushMemTables(const autovector& cfds, + const FlushOptions& flush_opts); + + // Wait for memtable compaction + Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); + + // Wait for any compaction + // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this + // is only for the special test of CancelledCompactions + Status TEST_WaitForCompact(bool waitUnscheduled = false); + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family = nullptr); + + // Return the current manifest file no. + uint64_t TEST_Current_Manifest_FileNo(); + + // Returns the number that'll be assigned to the next file that's created. + uint64_t TEST_Current_Next_FileNo(); + + // get total level0 file size. Only for testing. 
+ uint64_t TEST_GetLevel0TotalSize(); + + void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, + std::vector>* metadata); + + void TEST_LockMutex(); + + void TEST_UnlockMutex(); + + // REQUIRES: mutex locked + void* TEST_BeginWrite(); + + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); + + uint64_t TEST_MaxTotalInMemoryState() const { + return max_total_in_memory_state_; + } + + size_t TEST_LogsToFreeSize(); + + uint64_t TEST_LogfileNumber(); + + uint64_t TEST_total_log_size() const { return total_log_size_; } + + // Returns column family name to ImmutableCFOptions map. + Status TEST_GetAllImmutableCFOptions( + std::unordered_map* iopts_map); + + // Return the lastest MutableCFOptions of a column family + Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, + MutableCFOptions* mutable_cf_options); + + Cache* TEST_table_cache() { return table_cache_.get(); } + + WriteController& TEST_write_controler() { return write_controller_; } + + uint64_t TEST_FindMinLogContainingOutstandingPrep(); + uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); + + int TEST_BGCompactionsAllowed() const; + int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + void TEST_WaitForDumpStatsRun(std::function callback) const; + void TEST_WaitForPersistStatsRun(std::function callback) const; + bool TEST_IsPersistentStatsEnabled() const; + size_t TEST_EstimateInMemoryStatsHistorySize() const; +#endif // NDEBUG + protected: Env* const env_; const std::string dbname_; + std::string db_id_; std::unique_ptr versions_; // Flag to check whether we allocated and own the info log file bool own_info_log_; @@ -786,6 +955,7 @@ class DBImpl : public DB { recovered_transactions_; std::unique_ptr tracer_; InstrumentedMutex trace_mutex_; + BlockCacheTracer block_cache_tracer_; // State below is protected by mutex_ // With two_write_queues enabled, some of the variables that accessed during @@ -841,11 +1011,11 @@ class DBImpl : public DB { void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop); + int job_id); - void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta, - const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop); + void NotifyOnFlushCompleted( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + std::list>* flush_jobs_info); void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, const Status& st, @@ -897,14 +1067,32 @@ class DBImpl : public DB { bool disable_memtable = false, uint64_t* seq_used = nullptr); - // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates - // the number of sub-patches. A sub-patch is a subset of the write batch that - // does not have duplicate keys. 
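The block_cache_tracer_ member added above backs the StartBlockCacheTrace()/EndBlockCacheTrace() entry points declared earlier in this header. A hedged usage sketch, assuming an open rocksdb::DB* db and a writable trace path; NewFileTraceWriter and the TraceOptions defaults come from the public tracing API rather than from this diff:

    #include "rocksdb/trace_reader_writer.h"

    std::unique_ptr<rocksdb::TraceWriter> trace_writer;
    rocksdb::Status s = rocksdb::NewFileTraceWriter(
        rocksdb::Env::Default(), rocksdb::EnvOptions(),
        "/tmp/block_cache_trace", &trace_writer);
    if (s.ok()) {
      s = db->StartBlockCacheTrace(rocksdb::TraceOptions(),
                                   std::move(trace_writer));
    }
    // ... run the workload to be traced ...
    s = db->EndBlockCacheTrace();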
- Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates, - WriteCallback* callback = nullptr, - uint64_t* log_used = nullptr, uint64_t log_ref = 0, - uint64_t* seq_used = nullptr, size_t batch_cnt = 0, - PreReleaseCallback* pre_release_callback = nullptr); + // Write only to memtables without joining any write queue + Status UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, + uint64_t log_ref, SequenceNumber seq, + const size_t sub_batch_cnt); + + // Whether the batch requires to be assigned with an order + enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder }; + // Whether it requires publishing last sequence or not + enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq }; + + // Join the write_thread to write the batch only to the WAL. It is the + // responsibility of the caller to also write the write batch to the memtable + // if it required. + // + // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder + // indicating the number of sub-batches in my_batch. A sub-patch is a subset + // of the write batch that does not have duplicate keys. When seq_per_batch is + // not set, each key is a separate sub_batch. Otherwise each duplicate key + // marks start of a new sub-batch. + Status WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& options, + WriteBatch* updates, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable); // write cached_recoverable_state_ to memtable if it is not empty // The writer must be the leader in write_thread_ and holding mutex_ @@ -942,6 +1130,8 @@ class DBImpl : public DB { friend class DBTest_ConcurrentFlushWAL_Test; friend class DBTest_MixedSlowdownOptionsStop_Test; friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; + friend class DBCompactionTest_CompactionDuringShutdown_Test; + friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackTest_WriteWithCallbackTest_Test; @@ -949,7 +1139,10 @@ class DBImpl : public DB { friend class DBBlobIndexTest; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; #endif + struct CompactionState; + struct PrepickedCompaction; + struct PurgeFileInfo; struct WriteContext { SuperVersionContext superversion_context; @@ -958,16 +1151,137 @@ class DBImpl : public DB { explicit WriteContext(bool create_superversion = false) : superversion_context(create_superversion) {} - ~WriteContext() { - superversion_context.Clean(); - for (auto& m : memtables_to_free_) { - delete m; - } - } + ~WriteContext() { + superversion_context.Clean(); + for (auto& m : memtables_to_free_) { + delete m; + } + } + }; + + struct LogFileNumberSize { + explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + void AddSize(uint64_t new_size) { size += new_size; } + uint64_t number; + uint64_t size = 0; + bool getting_flushed = false; + }; + + struct LogWriterNumber { + // pass ownership of _writer + LogWriterNumber(uint64_t _number, log::Writer* _writer) + : number(_number), writer(_writer) {} + + log::Writer* ReleaseWriter() { + auto* w = writer; + writer = nullptr; + return w; + } + Status ClearWriter() { + Status s = writer->WriteBuffer(); + delete writer; + writer = nullptr; + 
return s; + } + + uint64_t number; + // Visual Studio doesn't support deque's member to be noncopyable because + // of a std::unique_ptr as a member. + log::Writer* writer; // own + // true for some prefix of logs_ + bool getting_synced = false; + }; + + // PurgeFileInfo is a structure to hold information of files to be deleted in + // purge_queue_ + struct PurgeFileInfo { + std::string fname; + std::string dir_to_sync; + FileType type; + uint64_t number; + int job_id; + PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, + int jid) + : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} + }; + + // Argument required by background flush thread. + struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + max_memtable_id_(max_memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t max_memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Argument passed to flush thread. + struct FlushThreadArg { + DBImpl* db_; + + Env::Priority thread_pri_; + }; + + // Information for a manual compaction + struct ManualCompactionState { + ColumnFamilyData* cfd; + int input_level; + int output_level; + uint32_t output_path_id; + Status status; + bool done; + bool in_progress; // compaction request being processed? + bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + }; + struct PrepickedCompaction { + // background compaction takes ownership of `compaction`. + Compaction* compaction; + // caller retains ownership of `manual_compaction_state` as it is reused + // across background compactions. + ManualCompactionState* manual_compaction_state; // nullptr if non-manual + // task limiter token is requested during compaction picking. + std::unique_ptr task_token; + }; + + struct CompactionArg { + // caller retains ownership of `db`. + DBImpl* db; + // background compaction takes ownership of `prepicked_compaction`. + PrepickedCompaction* prepicked_compaction; }; - struct PrepickedCompaction; - struct PurgeFileInfo; + // Initialize the built-in column family for persistent stats. Depending on + // whether on-disk persistent stats have been enabled before, it may either + // create a new column family and column family handle or just a column family + // handle. + // Required: DB mutex held + Status InitPersistStatsColumnFamily(); + + // Persistent Stats column family has two format version key which are used + // for compatibility check. 
Write format version if it's created for the + // first time, read format version and check compatibility if recovering + // from disk. This function requires DB mutex held at entrance but may + // release and re-acquire DB mutex in the process. + // Required: DB mutex held + Status PersistentStatsProcessFormatVersion(); Status ResumeImpl(); @@ -1005,7 +1319,8 @@ class DBImpl : public DB { // created between the calls CaptureCurrentFileNumberInPendingOutputs() and // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live // and blocked by any other pending_outputs_ calls) - void ReleaseFileNumberFromPendingOutputs(std::list::iterator v); + void ReleaseFileNumberFromPendingOutputs( + std::unique_ptr::iterator>& v); Status SyncClosedLogs(JobContext* job_context); @@ -1021,34 +1336,6 @@ class DBImpl : public DB { SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, Env::Priority thread_pri); - // Argument required by background flush thread. - struct BGFlushArg { - BGFlushArg() - : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} - BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, - SuperVersionContext* superversion_context) - : cfd_(cfd), - max_memtable_id_(max_memtable_id), - superversion_context_(superversion_context) {} - - // Column family to flush. - ColumnFamilyData* cfd_; - // Maximum ID of memtable to flush. In this column family, memtables with - // IDs smaller than this value must be flushed before this flush completes. - uint64_t max_memtable_id_; - // Pointer to a SuperVersionContext object. After flush completes, RocksDB - // installs a new superversion for the column family. This operation - // requires a SuperVersionContext object (currently embedded in JobContext). - SuperVersionContext* superversion_context_; - }; - - // Argument passed to flush thread. - struct FlushThreadArg { - DBImpl* db_; - - Env::Priority thread_pri_; - }; - // Flush the memtables of (multiple) column families to multiple files on // persistent storage. Status FlushMemTablesToOutputFiles( @@ -1060,8 +1347,8 @@ class DBImpl : public DB { JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); // REQUIRES: log_numbers are sorted in ascending order - virtual Status RecoverLogFiles(const std::vector& log_numbers, - SequenceNumber* next_sequence, bool read_only); + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* next_sequence, bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used at database RecoveryTime (when the @@ -1086,6 +1373,10 @@ class DBImpl : public DB { Status ScheduleFlushes(WriteContext* context); + void MaybeFlushStatsCF(autovector* cfds); + + Status TrimMemtableHistory(WriteContext* context); + Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); void SelectColumnFamiliesForAtomicFlush(autovector* cfds); @@ -1121,6 +1412,32 @@ class DBImpl : public DB { const autovector& flush_memtable_ids, bool resuming_from_bg_err); + inline void WaitForPendingWrites() { + mutex_.AssertHeld(); + // In case of pipelined write is enabled, wait for all pending memtable + // writers. + if (immutable_db_options_.enable_pipelined_write) { + // Memtable writers may call DB::Get in case max_successive_merges > 0, + // which may lock mutex. Unlocking mutex here to avoid deadlock. 
+ mutex_.Unlock(); + write_thread_.WaitForMemTableWriters(); + mutex_.Lock(); + } + + if (!immutable_db_options_.unordered_write) { + // Then the writes are finished before the next write group starts + return; + } + + // Wait for the ones who already wrote to the WAL to finish their + // memtable write. + if (pending_memtable_writes_.load() != 0) { + std::unique_lock guard(switch_mutex_); + switch_cv_.wait(guard, + [&] { return pending_memtable_writes_.load() == 0; }); + } + } + // REQUIRES: mutex locked and in write thread. void AssignAtomicFlushSeq(const autovector& cfds); @@ -1231,7 +1548,7 @@ class DBImpl : public DB { void PrintStatistics(); - size_t EstiamteStatsHistorySize() const; + size_t EstimateInMemoryStatsHistorySize() const; // persist stats to column family "_persistent_stats" void PersistStats(); @@ -1273,6 +1590,135 @@ class DBImpl : public DB { void WaitForBackgroundWork(); + // Background threads call this function, which is just a wrapper around + // the InstallSuperVersion() function. Background threads carry + // sv_context which can have new_superversion already + // allocated. + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. + void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, + const MutableCFOptions& mutable_cf_options); + + bool GetIntPropertyInternal(ColumnFamilyData* cfd, + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); + bool GetPropertyHandleOptionsStatistics(std::string* value); + + bool HasPendingManualCompaction(); + bool HasExclusiveManualCompaction(); + void AddManualCompaction(ManualCompactionState* m); + void RemoveManualCompaction(ManualCompactionState* m); + bool ShouldntRunManualCompaction(ManualCompactionState* m); + bool HaveManualCompaction(ColumnFamilyData* cfd); + bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); +#ifndef ROCKSDB_LITE + void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& compaction_job_stats, + const int job_id, const Version* current, + CompactionJobInfo* compaction_job_info) const; + // Reserve the next 'num' file numbers for to-be-ingested external SST files, + // and return the current file_number in 'next_file_number'. + // Write a version edit to the MANIFEST. + Status ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::unique_ptr::iterator>& pending_output_elem, + uint64_t* next_file_number); +#endif //! 
ROCKSDB_LITE + + bool ShouldPurge(uint64_t file_number) const; + void MarkAsGrabbedForPurge(uint64_t file_number); + + size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } + + Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, + size_t preallocate_block_size, log::Writer** new_log); + + // Validate self-consistency of DB options + static Status ValidateOptions(const DBOptions& db_options); + // Validate self-consistency of DB options and its consistency with cf options + static Status ValidateOptions( + const DBOptions& db_options, + const std::vector& column_families); + + // Utility function to do some debug validation and sort the given vector + // of MultiGet keys + void PrepareMultiGetKeys( + const size_t num_keys, bool sorted, + autovector* key_ptrs); + + // A structure to hold the information required to process MultiGet of keys + // belonging to one column family. For a multi column family MultiGet, there + // will be a container of these objects. + struct MultiGetColumnFamilyData { + ColumnFamilyHandle* cf; + ColumnFamilyData* cfd; + + // For the batched MultiGet which relies on sorted keys, start specifies + // the index of first key belonging to this column family in the sorted + // list. + size_t start; + + // For the batched MultiGet case, num_keys specifies the number of keys + // belonging to this column family in the sorted list + size_t num_keys; + + // SuperVersion for the column family obtained in a manner that ensures a + // consistent view across all column families in the DB + SuperVersion* super_version; + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, + SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(0), + num_keys(0), + super_version(sv) {} + + MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first, + size_t count, SuperVersion* sv) + : cf(column_family), + cfd(static_cast(cf)->cfd()), + start(first), + num_keys(count), + super_version(sv) {} + + MultiGetColumnFamilyData() = default; + }; + + // A common function to obtain a consistent snapshot, which can be implicit + // if the user doesn't specify a snapshot in read_options, across + // multiple column families for MultiGet. It will attempt to get an implicit + // snapshot without acquiring the db_mutes, but will give up after a few + // tries and acquire the mutex if a memtable flush happens. The template + // allows both the batched and non-batched MultiGet to call this with + // either an std::unordered_map or autovector of column families. + // + // If callback is non-null, the callback is refreshed with the snapshot + // sequence number + // + // A return value of true indicates that the SuperVersions were obtained + // from the ColumnFamilyData, whereas false indicates they are thread + // local + template + bool MultiCFSnapshot( + const ReadOptions& read_options, ReadCallback* callback, + std::function& + iter_deref_func, + T* cf_list, SequenceNumber* snapshot); + + // The actual implementation of the batching MultiGet. The caller is expected + // to have acquired the SuperVersion and pass in a snapshot sequence number + // in order to construct the LookupKeys. The start_key and num_keys specify + // the range of keys in the sorted_keys vector for a single column family. 
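Callers reach the batching machinery above through the array-based MultiGet() overloads declared earlier in this header. A minimal sketch of the per-key column family overload, assuming an open rocksdb::DB* db (keys and sizes are illustrative):

    constexpr size_t kNumKeys = 3;
    rocksdb::ColumnFamilyHandle* cfs[kNumKeys] = {db->DefaultColumnFamily(),
                                                  db->DefaultColumnFamily(),
                                                  db->DefaultColumnFamily()};
    rocksdb::Slice keys[kNumKeys] = {"k1", "k2", "k3"};
    rocksdb::PinnableSlice values[kNumKeys];
    rocksdb::Status statuses[kNumKeys];
    // statuses[i] reports the outcome for keys[i]; values are pinned slices.
    db->MultiGet(rocksdb::ReadOptions(), kNumKeys, cfs, keys, values, statuses,
                 /*sorted_input=*/false);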
+ void MultiGetImpl( + const ReadOptions& read_options, size_t start_key, size_t num_keys, + autovector* sorted_keys, + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, + bool* is_blob_index); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -1285,9 +1731,12 @@ class DBImpl : public DB { // logfile_number_. With two_write_queues it also protects alive_log_files_, // and log_empty_. Refer to the definition of each variable below for more // details. + // Note: to avoid dealock, if needed to acquire both log_write_mutex_ and + // mutex_, the order should be first mutex_ and then log_write_mutex_. InstrumentedMutex log_write_mutex_; std::atomic shutting_down_; + std::atomic manual_compaction_paused_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't @@ -1318,37 +1767,10 @@ class DBImpl : public DB { // expesnive mutex_ lock during WAL write, which update log_empty_. bool log_empty_; - struct LogFileNumberSize { - explicit LogFileNumberSize(uint64_t _number) : number(_number) {} - void AddSize(uint64_t new_size) { size += new_size; } - uint64_t number; - uint64_t size = 0; - bool getting_flushed = false; - }; - struct LogWriterNumber { - // pass ownership of _writer - LogWriterNumber(uint64_t _number, log::Writer* _writer) - : number(_number), writer(_writer) {} + ColumnFamilyHandleImpl* persist_stats_cf_handle_; - log::Writer* ReleaseWriter() { - auto* w = writer; - writer = nullptr; - return w; - } - Status ClearWriter() { - Status s = writer->WriteBuffer(); - delete writer; - writer = nullptr; - return s; - } + bool persistent_stats_cfd_exists_ = true; - uint64_t number; - // Visual Studio doesn't support deque's member to be noncopyable because - // of a std::unique_ptr as a member. - log::Writer* writer; // own - // true for some prefix of logs_ - bool getting_synced = false; - }; // Without two_write_queues, read and writes to alive_log_files_ are // protected by mutex_. However since back() is never popped, and push_back() // is done only from write_thread_, the same thread can access the item @@ -1395,30 +1817,6 @@ class DBImpl : public DB { bool stats_slice_initialized_ = false; - // Class to maintain directories for all database paths other than main one. - class Directories { - public: - Status SetDirectories(Env* env, const std::string& dbname, - const std::string& wal_dir, - const std::vector& data_paths); - - Directory* GetDataDir(size_t path_id) const; - - Directory* GetWalDir() { - if (wal_dir_) { - return wal_dir_.get(); - } - return db_dir_.get(); - } - - Directory* GetDbDir() { return db_dir_.get(); } - - private: - std::unique_ptr db_dir_; - std::vector> data_dirs_; - std::unique_ptr wal_dir_; - }; - Directories directories_; WriteBufferManager* write_buffer_manager_; @@ -1441,6 +1839,8 @@ class DBImpl : public DB { FlushScheduler flush_scheduler_; + TrimHistoryScheduler trim_history_scheduler_; + SnapshotList snapshots_; // For each background job, pending_outputs_ keeps the current file number at @@ -1454,19 +1854,6 @@ class DBImpl : public DB { // State is protected with db mutex. 
std::list pending_outputs_; - // PurgeFileInfo is a structure to hold information of files to be deleted in - // purge_queue_ - struct PurgeFileInfo { - std::string fname; - std::string dir_to_sync; - FileType type; - uint64_t number; - int job_id; - PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, - int jid) - : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} - }; - // flush_queue_ and compaction_queue_ hold column families that we need to // flush and compact, respectively. // A column family is inserted into flush_queue_ when it satisfies condition @@ -1492,12 +1879,12 @@ class DBImpl : public DB { // ColumnFamilyData::pending_compaction_ == true) std::deque compaction_queue_; - // A queue to store filenames of the files to be purged - std::deque purge_queue_; + // A map to store file numbers and filenames of the files to be purged + std::unordered_map purge_files_; // A vector to store the file numbers that have been assigned to certain // JobContext. Current implementation tracks ssts only. - std::vector files_grabbed_for_purge_; + std::unordered_set files_grabbed_for_purge_; // A queue to store log writers to close std::deque logs_to_free_queue_; @@ -1523,42 +1910,8 @@ class DBImpl : public DB { // number of background obsolete file purge jobs, submitted to the HIGH pool int bg_purge_scheduled_; - // Information for a manual compaction - struct ManualCompactionState { - ColumnFamilyData* cfd; - int input_level; - int output_level; - uint32_t output_path_id; - Status status; - bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted - bool exclusive; // current behavior of only one manual - bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress - }; - struct PrepickedCompaction { - // background compaction takes ownership of `compaction`. - Compaction* compaction; - // caller retains ownership of `manual_compaction_state` as it is reused - // across background compactions. - ManualCompactionState* manual_compaction_state; // nullptr if non-manual - // task limiter token is requested during compaction picking. - std::unique_ptr task_token; - }; std::deque manual_compaction_dequeue_; - struct CompactionArg { - // caller retains ownership of `db`. - DBImpl* db; - // background compaction takes ownership of `prepicked_compaction`. - PrepickedCompaction* prepicked_compaction; - }; - // shall we disable deletion of obsolete files // if 0 the deletion is enabled. // if non-zero, files will not be getting deleted @@ -1571,13 +1924,21 @@ class DBImpl : public DB { // corresponding call to PurgeObsoleteFiles has not yet finished. int pending_purge_obsolete_files_; - // last time when DeleteObsoleteFiles with full scan was executed. Originaly + // last time when DeleteObsoleteFiles with full scan was executed. Originally // initialized with startup time. uint64_t delete_obsolete_files_last_run_; // last time stats were dumped to LOG std::atomic last_stats_dump_time_microsec_; + // The thread that wants to switch memtable, can wait on this cv until the + // pending writes to memtable finishes. 
+ std::condition_variable switch_cv_; + // The mutex used by switch_cv_. mutex_ should be acquired beforehand. + std::mutex switch_mutex_; + // Number of threads intending to write to memtable + std::atomic pending_memtable_writes_ = {}; + // Each flush or compaction gets its own job id. this counter makes sure // they're unique std::atomic next_job_id_; @@ -1601,7 +1962,8 @@ class DBImpl : public DB { std::string db_absolute_path_; - // Number of running IngestExternalFile() calls. + // Number of running IngestExternalFile() or CreateColumnFamilyWithImport() + // calls. // REQUIRES: mutex held int num_running_ingest_file_; @@ -1646,68 +2008,6 @@ class DBImpl : public DB { // REQUIRES: mutex locked std::unique_ptr thread_persist_stats_; - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - // Background threads call this function, which is just a wrapper around - // the InstallSuperVersion() function. Background threads carry - // sv_context which can have new_superversion already - // allocated. - // All ColumnFamily state changes go through this function. Here we analyze - // the new state and we schedule background work if we detect that the new - // state needs flush or compaction. - void InstallSuperVersionAndScheduleWork( - ColumnFamilyData* cfd, SuperVersionContext* sv_context, - const MutableCFOptions& mutable_cf_options); - -#ifndef ROCKSDB_LITE - using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables( - ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) override; - virtual Status GetPropertiesOfTablesInRange( - ColumnFamilyHandle* column_family, const Range* range, std::size_t n, - TablePropertiesCollection* props) override; - -#endif // ROCKSDB_LITE - - bool GetIntPropertyInternal(ColumnFamilyData* cfd, - const DBPropertyInfo& property_info, - bool is_locked, uint64_t* value); - bool GetPropertyHandleOptionsStatistics(std::string* value); - - bool HasPendingManualCompaction(); - bool HasExclusiveManualCompaction(); - void AddManualCompaction(ManualCompactionState* m); - void RemoveManualCompaction(ManualCompactionState* m); - bool ShouldntRunManualCompaction(ManualCompactionState* m); - bool HaveManualCompaction(ColumnFamilyData* cfd); - bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); -#ifndef ROCKSDB_LITE - void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, - const Status& st, - const CompactionJobStats& compaction_job_stats, - const int job_id, const Version* current, - CompactionJobInfo* compaction_job_info) const; - // Reserve the next 'num' file numbers for to-be-ingested external SST files, - // and return the current file_number in 'next_file_number'. - // Write a version edit to the MANIFEST. - Status ReserveFileNumbersBeforeIngestion( - ColumnFamilyData* cfd, uint64_t num, - std::list::iterator* pending_output_elem, - uint64_t* next_file_number); -#endif //! ROCKSDB_LITE - - bool ShouldPurge(uint64_t file_number) const; - void MarkAsGrabbedForPurge(uint64_t file_number); - - size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - - Status CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, - size_t preallocate_block_size, log::Writer** new_log); - // When set, we use a separate queue for writes that dont write to memtable. // In 2PC these are the writes at Prepare phase. 
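// Illustrative sketch only, not part of this diff: the usual pairing for the
// pending_memtable_writes_ / switch_mutex_ / switch_cv_ members declared
// above. Only the waiting side appears verbatim earlier in this header; the
// writer side below is an assumed, simplified counterpart.
#include <atomic>
#include <condition_variable>
#include <mutex>

std::atomic<int> pending_memtable_writes{0};
std::mutex switch_mutex;
std::condition_variable switch_cv;

void WriterSide() {
  pending_memtable_writes.fetch_add(1);
  // ... write the batch into the memtable without holding the DB mutex ...
  if (pending_memtable_writes.fetch_sub(1) == 1) {
    // Last pending writer: wake any thread waiting to switch the memtable.
    std::lock_guard<std::mutex> guard(switch_mutex);
    switch_cv.notify_all();
  }
}

void SwitcherSide() {
  std::unique_lock<std::mutex> guard(switch_mutex);
  switch_cv.wait(guard, [] { return pending_memtable_writes.load() == 0; });
}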
const bool two_write_queues_; @@ -1753,6 +2053,8 @@ class DBImpl : public DB { // results sequentially. Flush results of memtables with lower IDs get // installed to MANIFEST first. InstrumentedCondVar atomic_flush_install_cv_; + + bool wal_in_db_path_; }; extern Options SanitizeOptions(const std::string& db, const Options& src); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc similarity index 90% rename from db/db_impl_compaction_flush.cc rename to db/db_impl/db_impl_compaction_flush.cc index f16c6111752..b01fdbc965c 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -6,23 +6,21 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" #include "db/event_helpers.h" +#include "file/sst_file_manager_impl.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" #include "util/concurrent_task_limiter_impl.h" -#include "util/sst_file_manager_impl.h" -#include "util/sync_point.h" namespace rocksdb { @@ -110,6 +108,13 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) { if (!s.ok()) { break; } + + if (immutable_db_options_.recycle_log_file_num > 0) { + s = log->Close(); + if (!s.ok()) { + break; + } + } } if (s.ok()) { s = directories_.GetWalDir()->Fsync(); @@ -159,8 +164,7 @@ Status DBImpl::FlushMemTableToOutputFile( #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. - NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id, - flush_job.GetTableProperties()); + NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); #endif // ROCKSDB_LITE Status s; @@ -201,15 +205,15 @@ Status DBImpl::FlushMemTableToOutputFile( cfd->current()->storage_info()->LevelSummary(&tmp)); } - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { Status new_bg_error = s; error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } if (s.ok()) { #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. - NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, - job_context->job_id, flush_job.GetTableProperties()); + NotifyOnFlushCompleted(cfd, mutable_cf_options, + flush_job.GetCommittedFlushJobsInfo()); auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); if (sfm) { @@ -228,6 +232,7 @@ Status DBImpl::FlushMemTableToOutputFile( } #endif // ROCKSDB_LITE } + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish"); return s; } @@ -254,7 +259,7 @@ Status DBImpl::FlushMemTablesToOutputFiles( snapshot_checker, log_buffer, thread_pri); if (!s.ok()) { status = s; - if (!s.IsShutdownInProgress()) { + if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { // At this point, DB is not shutting down, nor is cfd dropped. // Something is wrong, thus we break out of the loop. 
break; @@ -298,7 +303,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( autovector distinct_output_dirs; autovector distinct_output_dir_paths; - std::vector jobs; + std::vector> jobs; std::vector all_mutable_cf_options; int num_cfs = static_cast(cfds.size()); all_mutable_cf_options.reserve(num_cfs); @@ -325,7 +330,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); - jobs.emplace_back( + jobs.emplace_back(new FlushJob( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, env_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, @@ -333,8 +338,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri); - jobs.back().PickMemTable(); + thread_pri)); + jobs.back()->PickMemTable(); } std::vector file_meta(num_cfs); @@ -346,7 +351,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, - job_context->job_id, jobs[i].GetTableProperties()); + job_context->job_id); } #endif /* !ROCKSDB_LITE */ @@ -368,7 +373,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // TODO (yanqin): parallelize jobs with threads. for (int i = 1; i != num_cfs; ++i) { exec_status[i].second = - jobs[i].Run(&logs_with_prep_tracker_, &file_meta[i]); + jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i]); exec_status[i].first = true; } if (num_cfs > 1) { @@ -377,15 +382,18 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( TEST_SYNC_POINT( "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"); } + assert(exec_status.size() > 0); + assert(!file_meta.empty()); exec_status[0].second = - jobs[0].Run(&logs_with_prep_tracker_, &file_meta[0]); + jobs[0]->Run(&logs_with_prep_tracker_, &file_meta[0]); exec_status[0].first = true; Status error_status; for (const auto& e : exec_status) { if (!e.second.ok()) { s = e.second; - if (!e.second.IsShutdownInProgress()) { + if (!e.second.IsShutdownInProgress() && + !e.second.IsColumnFamilyDropped()) { // If a flush job did not return OK, and the CF is not dropped, and // the DB is not shutting down, then we have to return this result to // caller later. @@ -397,15 +405,11 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = error_status.ok() ? s : error_status; } - // If db is NOT shutting down, and one or more column families have been - // dropped. - // TODO: use separate status code for db shutdown and column family dropped. - if (s.IsShutdownInProgress() && - !shutting_down_.load(std::memory_order_acquire)) { + if (s.IsColumnFamilyDropped()) { s = Status::OK(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsShutdownInProgress() || s.IsColumnFamilyDropped()) { // Sync on all distinct output directories. 
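// Illustrative sketch only, not part of this diff: the dedicated
// ColumnFamilyDropped status used here lets callers stop overloading
// ShutdownInProgress, as the removed TODO above suggested. A caller-side
// check now reads roughly like this (FlushOneColumnFamily is a hypothetical
// helper used only for illustration):
Status s = FlushOneColumnFamily();
if (s.IsColumnFamilyDropped()) {
  s = Status::OK();  // a dropped column family is not an error for the flush
} else if (s.IsShutdownInProgress()) {
  // Stop scheduling background work, but do not record a background error.
}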
for (auto dir : distinct_output_dirs) { if (dir != nullptr) { @@ -422,7 +426,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( auto wait_to_install_func = [&]() { bool ready = true; for (size_t i = 0; i != cfds.size(); ++i) { - const auto& mems = jobs[i].GetMemTables(); + const auto& mems = jobs[i]->GetMemTables(); if (cfds[i]->IsDropped()) { // If the column family is dropped, then do not wait. continue; @@ -463,7 +467,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( autovector mutable_cf_options_list; autovector tmp_file_meta; for (int i = 0; i != num_cfs; ++i) { - const auto& mems = jobs[i].GetMemTables(); + const auto& mems = jobs[i]->GetMemTables(); if (!cfds[i]->IsDropped() && !mems.empty()) { tmp_cfds.emplace_back(cfds[i]); mems_list.emplace_back(&mems); @@ -499,12 +503,13 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( #ifndef ROCKSDB_LITE auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); + assert(all_mutable_cf_options.size() == static_cast(num_cfs)); for (int i = 0; i != num_cfs; ++i) { if (cfds[i]->IsDropped()) { continue; } - NotifyOnFlushCompleted(cfds[i], &file_meta[i], all_mutable_cf_options[i], - job_context->job_id, jobs[i].GetTableProperties()); + NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i], + jobs[i]->GetCommittedFlushJobsInfo()); if (sfm) { std::string file_path = MakeTableFileName( cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); @@ -523,17 +528,17 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Need to undo atomic flush if something went wrong, i.e. s is not OK and // it is not because of CF drop. - if (!s.ok() && !s.IsShutdownInProgress()) { + if (!s.ok() && !s.IsColumnFamilyDropped()) { // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. for (int i = 0; i != num_cfs; ++i) { if (!exec_status[i].first) { - jobs[i].Cancel(); + jobs[i]->Cancel(); } } for (int i = 0; i != num_cfs; ++i) { if (exec_status[i].first && exec_status[i].second.ok()) { - auto& mems = jobs[i].GetMemTables(); + auto& mems = jobs[i]->GetMemTables(); cfds[i]->imm()->RollbackMemtableFlush(mems, file_meta[i].fd.GetNumber()); } @@ -547,7 +552,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop) { + int job_id) { #ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.size() == 0U) { return; @@ -565,20 +570,21 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, // release lock while notifying events mutex_.Unlock(); { - FlushJobInfo info; + FlushJobInfo info{}; info.cf_id = cfd->GetID(); info.cf_name = cfd->GetName(); // TODO(yhchiang): make db_paths dynamic in case flush does not // go to L0 in the future. 
- info.file_path = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, - file_meta->fd.GetNumber()); + const uint64_t file_number = file_meta->fd.GetNumber(); + info.file_path = + MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number); + info.file_number = file_number; info.thread_id = env_->GetThreadID(); info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; info.smallest_seqno = file_meta->fd.smallest_seqno; info.largest_seqno = file_meta->fd.largest_seqno; - info.table_properties = prop; info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { listener->OnFlushBegin(this, info); @@ -592,15 +598,14 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, (void)file_meta; (void)mutable_cf_options; (void)job_id; - (void)prop; #endif // ROCKSDB_LITE } -void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, - FileMetaData* file_meta, - const MutableCFOptions& mutable_cf_options, - int job_id, TableProperties prop) { +void DBImpl::NotifyOnFlushCompleted( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + std::list>* flush_jobs_info) { #ifndef ROCKSDB_LITE + assert(flush_jobs_info != nullptr); if (immutable_db_options_.listeners.size() == 0U) { return; } @@ -617,34 +622,22 @@ void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, // release lock while notifying events mutex_.Unlock(); { - FlushJobInfo info; - info.cf_id = cfd->GetID(); - info.cf_name = cfd->GetName(); - // TODO(yhchiang): make db_paths dynamic in case flush does not - // go to L0 in the future. - info.file_path = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, - file_meta->fd.GetNumber()); - info.thread_id = env_->GetThreadID(); - info.job_id = job_id; - info.triggered_writes_slowdown = triggered_writes_slowdown; - info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->fd.smallest_seqno; - info.largest_seqno = file_meta->fd.largest_seqno; - info.table_properties = prop; - info.flush_reason = cfd->GetFlushReason(); - for (auto listener : immutable_db_options_.listeners) { - listener->OnFlushCompleted(this, info); + for (auto& info : *flush_jobs_info) { + info->triggered_writes_slowdown = triggered_writes_slowdown; + info->triggered_writes_stop = triggered_writes_stop; + for (auto listener : immutable_db_options_.listeners) { + listener->OnFlushCompleted(this, *info); + } } + flush_jobs_info->clear(); } mutex_.Lock(); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. 
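// Illustrative sketch only, not part of this diff: the unlock / notify /
// relock pattern used in the listener code above, so user-supplied
// EventListener callbacks never run while the DB mutex is held. Names below
// are simplified stand-ins.
#include <functional>
#include <mutex>
#include <vector>

void NotifyOutsideLock(std::mutex& db_mutex,
                       const std::vector<std::function<void()>>& callbacks) {
  db_mutex.unlock();  // callbacks may block or call back into the DB
  for (const auto& cb : callbacks) {
    cb();
  }
  db_mutex.lock();  // re-acquire before touching shared state again
}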
#else (void)cfd; - (void)file_meta; (void)mutable_cf_options; - (void)job_id; - (void)prop; + (void)flush_jobs_info; #endif // ROCKSDB_LITE } @@ -798,29 +791,6 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } -class SnapshotListFetchCallbackImpl : public SnapshotListFetchCallback { - public: - SnapshotListFetchCallbackImpl(DBImpl* db_impl, Env* env, - uint64_t snap_refresh_nanos, Logger* info_log) - : SnapshotListFetchCallback(env, snap_refresh_nanos), - db_impl_(db_impl), - info_log_(info_log) {} - virtual void Refresh(std::vector* snapshots, - SequenceNumber max) override { - size_t prev = snapshots->size(); - snapshots->clear(); - db_impl_->LoadSnapshots(snapshots, nullptr, max); - size_t now = snapshots->size(); - ROCKS_LOG_DEBUG(info_log_, - "Compaction snapshot count refreshed from %zu to %zu", prev, - now); - } - - private: - DBImpl* db_impl_; - Logger* info_log_; -}; - Status DBImpl::CompactFiles(const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, const std::vector& input_file_names, @@ -913,6 +883,9 @@ Status DBImpl::CompactFilesImpl( if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } + if (manual_compaction_paused_.load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } std::unordered_set input_set; for (const auto& file_name : input_file_names) { @@ -987,14 +960,12 @@ Status DBImpl::CompactFilesImpl( GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> pending_outputs_inserted_elem( + new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -1004,9 +975,7 @@ Status DBImpl::CompactFilesImpl( snapshot_checker, table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, - immutable_db_options_.max_subcompactions <= 1 ? 
&fetch_callback - : nullptr); + &compaction_job_stats, Env::Priority::USER, &manual_compaction_paused_); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -1050,8 +1019,14 @@ Status DBImpl::CompactFilesImpl( if (status.ok()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down + } else if (status.IsManualCompactionPaused()) { + // Don't report stopping manual compaction as error + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] [JOB %d] Stopping manual compaction", + c->column_family_data()->GetName().c_str(), + job_context->job_id); } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] [JOB %d] Compaction error: %s", @@ -1122,13 +1097,17 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, if (shutting_down_.load(std::memory_order_acquire)) { return; } + if (c->is_manual_compaction() && + manual_compaction_paused_.load(std::memory_order_acquire)) { + return; + } Version* current = cfd->current(); current->Ref(); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { - CompactionJobInfo info; + CompactionJobInfo info{}; info.cf_name = cfd->GetName(); info.status = st; info.thread_id = env_->GetThreadID(); @@ -1141,9 +1120,13 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, info.compression = c->output_compression(); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { + const FileDescriptor& desc = fmd->fd; + const uint64_t file_number = desc.GetNumber(); auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - fmd->fd.GetNumber(), fmd->fd.GetPathId()); + file_number, desc.GetPathId()); info.input_files.push_back(fn); + info.input_file_infos.push_back(CompactionFileInfo{ + static_cast(i), file_number, fmd->oldest_blob_file_number}); if (info.table_properties.count(fn) == 0) { std::shared_ptr tp; auto s = current->GetTableProperties(&tp, fmd, &fn); @@ -1154,9 +1137,13 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, } } for (const auto newf : c->edit()->GetNewFiles()) { + const FileMetaData& meta = newf.second; + const FileDescriptor& desc = meta.fd; + const uint64_t file_number = desc.GetNumber(); info.output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, newf.second.fd.GetNumber(), - newf.second.fd.GetPathId())); + c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + info.output_file_infos.push_back(CompactionFileInfo{ + newf.first, file_number, meta.oldest_blob_file_number}); } for (auto listener : immutable_db_options_.listeners) { listener->OnCompactionBegin(this, info); @@ -1184,13 +1171,17 @@ void DBImpl::NotifyOnCompactionCompleted( if (shutting_down_.load(std::memory_order_acquire)) { return; } + if (c->is_manual_compaction() && + manual_compaction_paused_.load(std::memory_order_acquire)) { + return; + } Version* current = cfd->current(); current->Ref(); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); { - CompactionJobInfo info; + CompactionJobInfo info{}; BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current, &info); for (auto listener : immutable_db_options_.listeners) { @@ -1265,7 +1256,8 @@ 
Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction); + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -1541,20 +1533,51 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, InstrumentedMutexLock guard_lock(&mutex_); WriteThread::Writer w; + WriteThread::Writer nonmem_w; if (!writes_stopped) { write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } } if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { s = SwitchMemtable(cfd, &context); } - if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { flush_memtable_id = cfd->imm()->GetLatestMemTableID(); flush_req.emplace_back(cfd, flush_memtable_id); } + if (immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && cfd_stats != cfd && + !cfd_stats->mem()->IsEmpty()) { + // only force flush stats CF when it will be the only CF lagging + // behind after the current flush + bool stats_cf_flush_needed = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats || loop_cfd == cfd) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + stats_cf_flush_needed = false; + } + } + if (stats_cf_flush_needed) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with manual flush of %s " + "to avoid holding old logs", + cfd->GetName().c_str()); + s = SwitchMemtable(cfd_stats, &context); + flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd_stats, flush_memtable_id); + } + } + } } if (s.ok() && !flush_req.empty()) { @@ -1562,15 +1585,29 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, ColumnFamilyData* loop_cfd = elem.first; loop_cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->Ref(); + } + } SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); } if (!writes_stopped) { write_thread_.ExitUnbatched(&w); + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } } } - + TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush"); + TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; @@ -1580,6 +1617,13 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* tmp_cfd : cfds) { + if (tmp_cfd->Unref()) { + // Only one thread can reach here. 
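// Illustrative sketch only, not part of this diff: the Ref()/Unref() pattern
// added above keeps the ColumnFamilyData alive while this thread waits for
// the flush, even if another thread drops the column family meanwhile.
// Simplified stand-in type:
#include <atomic>

struct RefCountedCfd {
  std::atomic<int> refs{1};
  void Ref() { refs.fetch_add(1); }
  bool Unref() { return refs.fetch_sub(1) == 1; }  // true => caller deletes
};

void WaitWithPin(RefCountedCfd* cfd) {
  cfd->Ref();  // pin before scheduling the flush and waiting on it
  // ... wait for the background flush of this column family to finish ...
  if (cfd->Unref()) {
    delete cfd;  // last reference; the real code does this under the DB mutex
  }
}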
+ InstrumentedMutexLock lock_guard(&mutex_); + delete tmp_cfd; + } + } } TEST_SYNC_POINT("FlushMemTableFinished"); return s; @@ -1614,8 +1658,12 @@ Status DBImpl::AtomicFlushMemTables( InstrumentedMutexLock guard_lock(&mutex_); WriteThread::Writer w; + WriteThread::Writer nonmem_w; if (!writes_stopped) { write_thread_.EnterUnbatched(&w, &mutex_); + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } } for (auto cfd : column_family_datas) { @@ -1643,6 +1691,15 @@ Status DBImpl::AtomicFlushMemTables( for (auto cfd : cfds) { cfd->imm()->FlushRequested(); } + // If the caller wants to wait for this flush to complete, it indicates + // that the caller expects the ColumnFamilyData not to be free'ed by + // other threads which may drop the column family concurrently. + // Therefore, we increase the cfd's ref count. + if (flush_options.wait) { + for (auto cfd : cfds) { + cfd->Ref(); + } + } GenerateFlushRequest(cfds, &flush_req); SchedulePendingFlush(flush_req, flush_reason); MaybeScheduleFlushOrCompaction(); @@ -1650,10 +1707,13 @@ Status DBImpl::AtomicFlushMemTables( if (!writes_stopped) { write_thread_.ExitUnbatched(&w); + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } } } TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush"); - + TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector flush_memtable_ids; for (auto& iter : flush_req) { @@ -1661,6 +1721,13 @@ Status DBImpl::AtomicFlushMemTables( } s = WaitForFlushMemTables(cfds, flush_memtable_ids, (flush_reason == FlushReason::kErrorRecovery)); + for (auto* cfd : cfds) { + if (cfd->Unref()) { + // Only one thread can reach here. + InstrumentedMutexLock lock_guard(&mutex_); + delete cfd; + } + } } return s; } @@ -1695,7 +1762,10 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, cfd->GetName().c_str()); bg_cv_.Wait(); } - if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { + if (cfd->IsDropped()) { + return Status::ColumnFamilyDropped(); + } + if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } @@ -1810,6 +1880,14 @@ Status DBImpl::EnableAutoCompaction( return s; } +void DBImpl::DisableManualCompaction() { + manual_compaction_paused_.store(true, std::memory_order_release); +} + +void DBImpl::EnableManualCompaction() { + manual_compaction_paused_.store(false, std::memory_order_release); +} + void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); if (!opened_successfully_) { @@ -1995,7 +2073,7 @@ void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync, FileType type, uint64_t number, int job_id) { mutex_.AssertHeld(); PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id); - purge_queue_.push_back(std::move(file_info)); + purge_files_.insert({{number, std::move(file_info)}}); } void DBImpl::BGWorkFlush(void* arg) { @@ -2004,7 +2082,8 @@ void DBImpl::BGWorkFlush(void* arg) { IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush"); - reinterpret_cast(fta.db_)->BackgroundCallFlush(fta.thread_pri_); + static_cast_with_check(fta.db_)->BackgroundCallFlush( + fta.thread_pri_); TEST_SYNC_POINT("DBImpl::BGWorkFlush:done"); } @@ -2015,7 +2094,7 @@ void DBImpl::BGWorkCompaction(void* arg) { TEST_SYNC_POINT("DBImpl::BGWorkCompaction"); auto prepicked_compaction = static_cast(ca.prepicked_compaction); - reinterpret_cast(ca.db)->BackgroundCallCompaction( + 
static_cast_with_check(ca.db)->BackgroundCallCompaction( prepicked_compaction, Env::Priority::LOW); delete prepicked_compaction; } @@ -2080,6 +2159,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, autovector bg_flush_args; std::vector& superversion_contexts = job_context->superversion_contexts; + autovector column_families_not_to_flush; while (!flush_queue_.empty()) { // This cfd is already referenced const FlushRequest& flush_req = PopFirstFromFlushQueue(); @@ -2090,9 +2170,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, ColumnFamilyData* cfd = iter.first; if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { // can't flush this CF, try next one - if (cfd->Unref()) { - delete cfd; - } + column_families_not_to_flush.push_back(cfd); continue; } superversion_contexts.emplace_back(SuperVersionContext(true)); @@ -2120,6 +2198,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, } status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, job_context, log_buffer, thread_pri); + TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); // All the CFDs in the FlushReq must have the same flush reason, so just // grab the first one *reason = bg_flush_args[0].cfd_->GetFlushReason(); @@ -2131,6 +2210,11 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, } } } + for (auto cfd : column_families_not_to_flush) { + if (cfd->Unref()) { + delete cfd; + } + } return status; } @@ -2147,13 +2231,14 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { assert(bg_flush_scheduled_); num_running_flushes_++; - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); FlushReason reason; Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason, thread_pri); - if (!s.ok() && !s.IsShutdownInProgress() && + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && reason != FlushReason::kErrorRecovery) { // Wait a little bit before retrying background flush in // case this is an environmental problem and we do not want to @@ -2178,7 +2263,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { // If flush failed, we want to delete all temporary files that we might have // created. 
Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); // delete unnecessary files if any, this is done outside the mutex if (job_context.HaveSomethingToClean() || job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { @@ -2228,8 +2314,9 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, num_running_compactions_++; - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> + pending_outputs_inserted_elem(new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); assert((bg_thread_pri == Env::Priority::BOTTOM && bg_bottom_compaction_scheduled_) || @@ -2242,7 +2329,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, mutex_.Unlock(); env_->SleepForMicroseconds(10000); // prevent hot loop mutex_.Lock(); - } else if (!s.ok() && !s.IsShutdownInProgress()) { + } else if (!s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of @@ -2259,6 +2347,11 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, LogFlush(immutable_db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); + } else if (s.IsManualCompactionPaused()) { + ManualCompactionState* m = prepicked_compaction->manual_compaction_state; + assert(m); + ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused", + m->cfd->GetName().c_str(), job_context.job_id); } ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); @@ -2266,7 +2359,9 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // If compaction failed, we want to delete all temporary files that we might // have created (they might not be all recorded in job_context in case of a // failure). 
Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && + !s.IsColumnFamilyDropped()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2349,6 +2444,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, if (!error_handler_.IsBGWorkStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { status = Status::ShutdownInProgress(); + } else if (is_manual && + manual_compaction_paused_.load(std::memory_order_acquire)) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } } else { status = error_handler_.GetBGError(); @@ -2573,7 +2671,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, - f->fd.largest_seqno, f->marked_for_compaction); + f->fd.largest_seqno, f->marked_for_compaction, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time); ROCKS_LOG_BUFFER( log_buffer, @@ -2650,9 +2750,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); - SnapshotListFetchCallbackImpl fetch_callback( - this, env_, c->mutable_cf_options()->snap_refresh_nanos, - immutable_db_options_.info_log.get()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, env_options_for_compaction_, versions_.get(), &shutting_down_, @@ -2663,14 +2760,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, &compaction_job_stats, thread_pri, - immutable_db_options_.max_subcompactions <= 1 ? &fetch_callback - : nullptr); + is_manual ? 
&manual_compaction_paused_ : nullptr); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); compaction_job.Run(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); @@ -2702,9 +2800,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job_stats, job_context->job_id); } - if (status.ok() || status.IsCompactionTooLarge()) { + if (status.ok() || status.IsCompactionTooLarge() || + status.IsManualCompactionPaused()) { // Done - } else if (status.IsShutdownInProgress()) { + } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", @@ -2788,7 +2887,7 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { it = manual_compaction_dequeue_.erase(it); return; } - it++; + ++it; } assert(false); return; @@ -2809,7 +2908,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { bool seen = false; while (it != manual_compaction_dequeue_.end()) { if (m == (*it)) { - it++; + ++it; seen = true; continue; } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) { @@ -2818,7 +2917,7 @@ bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { // and (*it) is ahead in the queue and is not yet in progress return true; } - it++; + ++it; } return false; } @@ -2836,7 +2935,7 @@ bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) { // in progress return true; } - it++; + ++it; } return false; } @@ -2849,7 +2948,7 @@ bool DBImpl::HasExclusiveManualCompaction() { if ((*it)->exclusive) { return true; } - it++; + ++it; } return false; } @@ -2883,9 +2982,13 @@ void DBImpl::BuildCompactionJobInfo( compaction_job_info->compression = c->output_compression(); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - fmd->fd.GetNumber(), fmd->fd.GetPathId()); + const FileDescriptor& desc = fmd->fd; + const uint64_t file_number = desc.GetNumber(); + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + desc.GetPathId()); compaction_job_info->input_files.push_back(fn); + compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ + static_cast(i), file_number, fmd->oldest_blob_file_number}); if (compaction_job_info->table_properties.count(fn) == 0) { std::shared_ptr tp; auto s = current->GetTableProperties(&tp, fmd, &fn); @@ -2896,9 +2999,13 @@ void DBImpl::BuildCompactionJobInfo( } } for (const auto& newf : c->edit()->GetNewFiles()) { - compaction_job_info->output_files.push_back( - TableFileName(c->immutable_cf_options()->cf_paths, - newf.second.fd.GetNumber(), newf.second.fd.GetPathId())); + const FileMetaData& meta = newf.second; + const FileDescriptor& desc = meta.fd; + const uint64_t file_number = desc.GetNumber(); + compaction_job_info->output_files.push_back(TableFileName( + c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ + newf.first, file_number, meta.oldest_blob_file_number}); } } #endif @@ -2957,34 +3064,20 @@ void DBImpl::InstallSuperVersionAndScheduleWork( } // ShouldPurge is called by FindObsoleteFiles when doing a full scan, -// and db 
mutex (mutex_) should already be held. This function performs a -// linear scan of an vector (files_grabbed_for_purge_) in search of a -// certain element. We expect FindObsoleteFiles with full scan to occur once -// every 10 hours by default, and the size of the vector is small. -// Therefore, the cost is affordable even if the mutex is held. +// and db mutex (mutex_) should already be held. // Actually, the current implementation of FindObsoleteFiles with // full_scan=true can issue I/O requests to obtain list of files in // directories, e.g. env_->getChildren while holding db mutex. -// In the future, if we want to reduce the cost of search, we may try to keep -// the vector sorted. bool DBImpl::ShouldPurge(uint64_t file_number) const { - for (auto fn : files_grabbed_for_purge_) { - if (file_number == fn) { - return false; - } - } - for (const auto& purge_file_info : purge_queue_) { - if (purge_file_info.number == file_number) { - return false; - } - } - return true; + return files_grabbed_for_purge_.find(file_number) == + files_grabbed_for_purge_.end() && + purge_files_.find(file_number) == purge_files_.end(); } // MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex // (mutex_) should already be held. void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) { - files_grabbed_for_purge_.emplace_back(file_number); + files_grabbed_for_purge_.insert(file_number); } void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) { diff --git a/db/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc similarity index 90% rename from db/db_impl_debug.cc rename to db/db_impl/db_impl_debug.cc index f558971190e..566c175735a 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -9,12 +9,13 @@ #ifndef NDEBUG -#include "db/db_impl.h" +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "monitoring/thread_status_updater.h" +#include "util/cast_util.h" namespace rocksdb { - uint64_t DBImpl::TEST_GetLevel0TotalSize() { InstrumentedMutexLock l(&mutex_); return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); @@ -104,7 +105,16 @@ Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) { if (cfd == nullptr) { cfd = default_cf_handle_->cfd(); } - return SwitchMemtable(cfd, &write_context); + + if (two_write_queues_) { + WriteThread::Writer nonmem_w; + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + Status s = SwitchMemtable(cfd, &write_context); + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + return s; + } else { + return SwitchMemtable(cfd, &write_context); + } } Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, @@ -122,6 +132,16 @@ Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, return FlushMemTable(cfd, fo, FlushReason::kTest); } +Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd, + const FlushOptions& flush_opts) { + return FlushMemTable(cfd, flush_opts, FlushReason::kTest); +} + +Status DBImpl::TEST_AtomicFlushMemTables( + const autovector& cfds, const FlushOptions& flush_opts) { + return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); +} + Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { @@ -262,8 +282,8 @@ bool DBImpl::TEST_IsPersistentStatsEnabled() const { return thread_persist_stats_ && thread_persist_stats_->IsRunning(); } -size_t DBImpl::TEST_EstiamteStatsHistorySize() const { - return EstiamteStatsHistorySize(); +size_t 
DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { + return EstimateInMemoryStatsHistorySize(); } } // namespace rocksdb #endif // NDEBUG diff --git a/db/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc similarity index 96% rename from db/db_impl_experimental.cc rename to db/db_impl/db_impl_experimental.cc index 47a880199e2..9a6e85ea6f5 100644 --- a/db/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -7,13 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include "db/column_family.h" @@ -132,7 +128,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction); + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl_files.cc b/db/db_impl/db_impl_files.cc similarity index 97% rename from db/db_impl_files.cc rename to db/db_impl/db_impl_files.cc index b16cf87947d..1fa2884062c 100644 --- a/db/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -6,18 +6,16 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include #include #include "db/event_helpers.h" #include "db/memtable_list.h" -#include "util/file_util.h" -#include "util/sst_file_manager_impl.h" +#include "file/file_util.h" +#include "file/sst_file_manager_impl.h" +#include "util/autovector.h" namespace rocksdb { @@ -261,7 +259,8 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, Status file_deletion_status; if (type == kTableFile || type == kLogFile) { file_deletion_status = - DeleteDBFile(&immutable_db_options_, fname, path_to_sync); + DeleteDBFile(&immutable_db_options_, fname, path_to_sync, + /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); } else { file_deletion_status = env_->DeleteFile(fname); } @@ -318,11 +317,9 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { candidate_files.size() + state.sst_delete_files.size() + state.log_delete_files.size() + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
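// Illustrative sketch only, not part of this diff: with purge_files_ now an
// unordered_map keyed by file number and files_grabbed_for_purge_ an
// unordered_set (see the ShouldPurge / MarkAsGrabbedForPurge changes above),
// the purge bookkeeping becomes two O(1) lookups instead of linear scans.
// Simplified stand-in:
#include <cstdint>
#include <string>
#include <unordered_map>
#include <unordered_set>

struct PurgeBookkeeping {
  std::unordered_set<uint64_t> grabbed_for_purge;
  std::unordered_map<uint64_t, std::string> pending_purge;  // number -> fname

  bool ShouldPurge(uint64_t file_number) const {
    return grabbed_for_purge.find(file_number) == grabbed_for_purge.end() &&
           pending_purge.find(file_number) == pending_purge.end();
  }
  void MarkAsGrabbedForPurge(uint64_t file_number) {
    grabbed_for_purge.insert(file_number);
  }
};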
- const char* kDumbDbName = ""; for (auto& file : state.sst_delete_files) { candidate_files.emplace_back( - MakeTableFileName(kDumbDbName, file.metadata->fd.GetNumber()), - file.path); + MakeTableFileName(file.metadata->fd.GetNumber()), file.path); if (file.metadata->table_reader_handle) { table_cache_->Release(file.metadata->table_reader_handle); } @@ -331,7 +328,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { for (auto file_num : state.log_delete_files) { if (file_num > 0) { - candidate_files.emplace_back(LogFileName(kDumbDbName, file_num), + candidate_files.emplace_back(LogFileName(file_num), immutable_db_options_.wal_dir); } } @@ -382,6 +379,12 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { } } + // Close WALs before trying to delete them. + for (const auto w : state.logs_to_free) { + // TODO: maybe check the return value of Close. + w->Close(); + } + std::unordered_set files_to_del; for (const auto& candidate_file : candidate_files) { const std::string& to_delete = candidate_file.file_name; @@ -481,11 +484,6 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { } #endif // !ROCKSDB_LITE - for (const auto w : state.logs_to_free) { - // TODO: maybe check the return value of Close. - w->Close(); - } - Status file_deletion_status; if (schedule_only) { InstrumentedMutexLock guard_lock(&mutex_); @@ -499,13 +497,15 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // After purging obsolete files, remove them from files_grabbed_for_purge_. // Use a temporary vector to perform bulk deletion via swap. InstrumentedMutexLock guard_lock(&mutex_); - std::vector tmp; + autovector to_be_removed; for (auto fn : files_grabbed_for_purge_) { - if (files_to_del.count(fn) == 0) { - tmp.emplace_back(fn); + if (files_to_del.count(fn) != 0) { + to_be_removed.emplace_back(fn); } } - files_grabbed_for_purge_.swap(tmp); + for (auto fn : to_be_removed) { + files_grabbed_for_purge_.erase(fn); + } } // Delete old info log files. diff --git a/db/db_impl_open.cc b/db/db_impl/db_impl_open.cc similarity index 82% rename from db/db_impl_open.cc rename to db/db_impl/db_impl_open.cc index 1bc69b49182..9ca0a940cc7 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -6,21 +6,21 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/builder.h" #include "db/error_handler.h" +#include "file/read_write_util.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" +#include "monitoring/persistent_stats_history.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "test_util/sync_point.h" #include "util/rate_limiter.h" -#include "util/sst_file_manager_impl.h" -#include "util/sync_point.h" namespace rocksdb { Options SanitizeOptions(const std::string& dbname, const Options& src) { @@ -124,6 +124,25 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } #ifndef ROCKSDB_LITE + ImmutableDBOptions immutable_db_options(result); + if (!IsWalDirSameAsDBPath(&immutable_db_options)) { + // Either the WAL dir and db_paths[0]/db_name are not the same, or we + // cannot tell for sure. In either case, assume they're different and + // explicitly cleanup the trash log files (bypass DeleteScheduler) + // Do this first so even if we end up calling + // DeleteScheduler::CleanupDirectory on the same dir later, it will be + // safe + std::vector filenames; + result.env->GetChildren(result.wal_dir, &filenames); + for (std::string& filename : filenames) { + if (filename.find(".log.trash", filename.length() - + std::string(".log.trash").length()) != + std::string::npos) { + std::string trash_file = result.wal_dir + "/" + filename; + result.env->DeleteFile(trash_file); + } + } + } // When the DB is stopped, it's possible that there are some .trash files that // were not deleted yet, when we open the DB we will find these .trash files // and schedule them to be deleted (or delete immediately if SstFileManager @@ -145,7 +164,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } namespace { - Status SanitizeOptionsByTable( const DBOptions& db_opts, const std::vector& column_families) { @@ -158,52 +176,23 @@ Status SanitizeOptionsByTable( } return Status::OK(); } +} // namespace -static Status ValidateOptions( +Status DBImpl::ValidateOptions( const DBOptions& db_options, const std::vector& column_families) { Status s; - for (auto& cfd : column_families) { - s = CheckCompressionSupported(cfd.options); - if (s.ok() && db_options.allow_concurrent_memtable_write) { - s = CheckConcurrentWritesSupported(cfd.options); - } - if (s.ok()) { - s = CheckCFPathsSupported(db_options, cfd.options); - } + s = ColumnFamilyData::ValidateOptions(db_options, cfd.options); if (!s.ok()) { return s; } - - if (cfd.options.ttl > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "TTL is only supported when files are always " - "kept open (set max_open_files = -1). "); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "TTL is only supported in Block-Based Table format. "); - } - } - - if (cfd.options.periodic_compaction_seconds > 0) { - if (db_options.max_open_files != -1) { - return Status::NotSupported( - "Periodic Compaction is only supported when files are always " - "kept open (set max_open_files = -1). "); - } - if (cfd.options.table_factory->Name() != - BlockBasedTableFactory().Name()) { - return Status::NotSupported( - "Periodic Compaction is only supported in " - "Block-Based Table format. 
"); - } - } } + s = ValidateOptions(db_options); + return s; +} +Status DBImpl::ValidateOptions(const DBOptions& db_options) { if (db_options.db_paths.size() > 4) { return Status::NotSupported( "More than four DB paths are not supported yet. "); @@ -228,17 +217,40 @@ static Status ValidateOptions( return Status::InvalidArgument("keep_log_file_num must be greater than 0"); } + if (db_options.unordered_write && + !db_options.allow_concurrent_memtable_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with !allow_concurrent_memtable_write"); + } + + if (db_options.unordered_write && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "unordered_write is incompatible with enable_pipelined_write"); + } + + if (db_options.atomic_flush && db_options.enable_pipelined_write) { + return Status::InvalidArgument( + "atomic_flush is incompatible with enable_pipelined_write"); + } + return Status::OK(); } -} // namespace + Status DBImpl::NewDB() { VersionEdit new_db; + Status s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.write_dbid_to_manifest) { + std::string temp_db_id; + GetDbIdentityFromIdentityFile(&temp_db_id); + new_db.SetDBId(temp_db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); - Status s; - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); { @@ -286,9 +298,9 @@ Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, return env->NewDirectory(dirname, directory); } -Status DBImpl::Directories::SetDirectories( - Env* env, const std::string& dbname, const std::string& wal_dir, - const std::vector& data_paths) { +Status Directories::SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths) { Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); if (!s.ok()) { return s; @@ -341,6 +353,9 @@ Status DBImpl::Recover( s = env_->FileExists(CurrentFileName(dbname_)); if (s.IsNotFound()) { if (immutable_db_options_.create_if_missing) { + // Has to be called only after Identity File creation is successful + // because DB ID is stored in Manifest if + // immutable_db_options_.write_dbid_to_manifest = true s = NewDB(); is_new_db = true; if (!s.ok()) { @@ -360,30 +375,19 @@ Status DBImpl::Recover( assert(s.IsIOError()); return s; } - // Check for the IDENTITY file and create it if not there - s = env_->FileExists(IdentityFileName(dbname_)); - if (s.IsNotFound()) { - s = SetIdentityFile(env_, dbname_); - if (!s.ok()) { - return s; - } - } else if (!s.ok()) { - assert(s.IsIOError()); - return s; - } // Verify compatibility of env_options_ and filesystem { std::unique_ptr idfile; EnvOptions customized_env(env_options_); customized_env.use_direct_reads |= immutable_db_options_.use_direct_io_for_flush_and_compaction; - s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + s = env_->NewRandomAccessFile(CurrentFileName(dbname_), &idfile, customized_env); if (!s.ok()) { std::string error_str = s.ToString(); // Check if unsupported Direct I/O is the root cause customized_env.use_direct_reads = false; - s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + s = env_->NewRandomAccessFile(CurrentFileName(dbname_), &idfile, customized_env); if (s.ok()) { return Status::InvalidArgument( @@ -395,8 +399,43 @@ Status DBImpl::Recover( } } } + assert(db_id_.empty()); + Status s = 
versions_->Recover(column_families, read_only, &db_id_); + if (!s.ok()) { + return s; + } + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. + if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = env_->FileExists(IdentityFileName(dbname_)); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. + if (s.IsNotFound()) { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + false); + } + } else { + SetIdentityFile(env_, dbname_, db_id_); + } - Status s = versions_->Recover(column_families, read_only); if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } @@ -408,6 +447,10 @@ Status DBImpl::Recover( } } } + // DB mutex is already held + if (s.ok() && immutable_db_options_.persist_stats_to_disk) { + s = InitPersistStatsColumnFamily(); + } // Initial max_total_in_memory_state_ before recovery logs. Log recovery // may check this value to decide whether to flush. @@ -423,6 +466,8 @@ Status DBImpl::Recover( default_cf_handle_ = new ColumnFamilyHandleImpl( versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + // TODO(Zhongyi): handle single_column_family_mode_ when + // persistent_stats is enabled single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; @@ -518,6 +563,98 @@ Status DBImpl::Recover( return s; } +Status DBImpl::PersistentStatsProcessFormatVersion() { + mutex_.AssertHeld(); + Status s; + // persist version when stats CF doesn't exist + bool should_persist_format_version = !persistent_stats_cfd_exists_; + mutex_.Unlock(); + if (persistent_stats_cfd_exists_) { + // Check persistent stats format version compatibility. Drop and recreate + // persistent stats CF if format version is incompatible + uint64_t format_version_recovered = 0; + Status s_format = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kFormatVersion, &format_version_recovered); + uint64_t compatible_version_recovered = 0; + Status s_compatible = DecodePersistentStatsVersionNumber( + this, StatsVersionKeyType::kCompatibleVersion, + &compatible_version_recovered); + // abort reading from existing stats CF if any of following is true: + // 1. failed to read format version or compatible version from disk + // 2. sst's format version is greater than current format version, meaning + // this sst is encoded with a newer RocksDB release, and current compatible + // version is below the sst's compatible version + if (!s_format.ok() || !s_compatible.ok() || + (kStatsCFCurrentFormatVersion < format_version_recovered && + kStatsCFCompatibleFormatVersion < compatible_version_recovered)) { + if (!s_format.ok() || !s_compatible.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Reading persistent stats version key failed. 
Format key: %s, " + "compatible key: %s", + s_format.ToString().c_str(), s_compatible.ToString().c_str()); + } else { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Disable persistent stats due to corrupted or incompatible format " + "version\n"); + } + DropColumnFamily(persist_stats_cf_handle_); + DestroyColumnFamilyHandle(persist_stats_cf_handle_); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + // should also persist version here because old stats CF is discarded + should_persist_format_version = true; + } + } + if (s.ok() && should_persist_format_version) { + // Persistent stats CF being created for the first time, need to write + // format version key + WriteBatch batch; + batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString, + ToString(kStatsCFCurrentFormatVersion)); + batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString, + ToString(kStatsCFCompatibleFormatVersion)); + WriteOptions wo; + wo.low_pri = true; + wo.no_slowdown = true; + wo.sync = false; + s = Write(wo, &batch); + } + mutex_.Lock(); + return s; +} + +Status DBImpl::InitPersistStatsColumnFamily() { + mutex_.AssertHeld(); + assert(!persist_stats_cf_handle_); + ColumnFamilyData* persistent_stats_cfd = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr; + + Status s; + if (persistent_stats_cfd != nullptr) { + // We are recovering from a DB which already contains persistent stats CF, + // the CF is already created in VersionSet::ApplyOneVersionEdit, but + // column family handle was not. Need to explicitly create handle here. + persist_stats_cf_handle_ = + new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_); + } else { + mutex_.Unlock(); + ColumnFamilyHandle* handle = nullptr; + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + persist_stats_cf_handle_ = static_cast(handle); + mutex_.Lock(); + } + return s; +} + // REQUIRES: log_numbers are sorted in ascending order Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, bool read_only) { @@ -577,12 +714,13 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool stop_replay_for_corruption = false; bool flushed = false; uint64_t corrupted_log_number = kMaxSequenceNumber; + uint64_t min_log_number = MinLogNumberToKeep(); for (auto log_number : log_numbers) { - if (log_number < versions_->min_log_number_to_keep_2pc()) { + if (log_number < min_log_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Skipping log #%" PRIu64 " since it is older than min log to keep #%" PRIu64, - log_number, versions_->min_log_number_to_keep_2pc()); + log_number, min_log_number); continue; } // The previous incarnation may not have written any MANIFEST @@ -623,7 +761,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, continue; } } - file_reader.reset(new SequentialFileReader(std::move(file), fname)); + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size)); } // Create the log reader. 
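The recovery path above stores the DB ID in the MANIFEST when write_dbid_to_manifest is set (earlier hunks) and manages a dedicated persistent-stats column family, including a format-version check, when persist_stats_to_disk is set. From the user side both are plain DBOptions flags; a small usage sketch, where the path and the stats period are placeholders:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Record the DB ID in the MANIFEST in addition to the IDENTITY file.
  options.write_dbid_to_manifest = true;
  // Keep the stats history in an internal column family on disk instead of
  // only in memory; a sample is taken every stats_persist_period_sec seconds.
  options.persist_stats_to_disk = true;
  options.stats_persist_period_sec = 600;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_example", &db);
  assert(s.ok());

  std::string db_id;
  s = db->GetDbIdentity(db_id);  // the same ID that is now also in the MANIFEST
  assert(s.ok());

  delete db;
  return 0;
}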
@@ -763,9 +902,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // That's why we set ignore missing column families to true bool has_valid_writes = false; status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), &flush_scheduler_, true, - log_number, this, false /* concurrent_memtable_writes */, - next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); + &batch, column_family_memtables_.get(), &flush_scheduler_, + &trim_history_scheduler_, true, log_number, this, + false /* concurrent_memtable_writes */, next_sequence, + &has_valid_writes, seq_per_batch_, batch_per_txn_); MaybeIgnoreError(&status); if (!status.ok()) { // We are treating this as a failure while reading since we read valid @@ -832,6 +972,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } flush_scheduler_.Clear(); + trim_history_scheduler_.Clear(); auto last_sequence = *next_sequence - 1; if ((*next_sequence != kMaxSequenceNumber) && (versions_->LastSequence() <= last_sequence)) { @@ -882,6 +1023,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, continue; } + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr); + // flush the final memtable (if non-empty) if (cfd->mem()->GetFirstSequenceNumber() != 0) { // If flush happened in the middle of recovery (e.g. due to memtable @@ -901,7 +1045,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, data_seen = true; } - // write MANIFEST with update + // Update the log number info in the version edit corresponding to this + // column family. Note that the version edits will be written to MANIFEST + // together later. // writing log_number in the manifest means that any log file // with number strongly less than (log_number + 1) is already // recovered and should be ignored on next reincarnation. @@ -910,17 +1056,28 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { edit->SetLogNumber(max_log_number + 1); } + } + if (status.ok()) { // we must mark the next log number as used, even though it's // not actually used. 
that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number versions_->MarkFileNumberUsed(max_log_number + 1); - status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - edit, &mutex_); - if (!status.ok()) { - // Recovery failed - break; + + autovector cfds; + autovector cf_opts; + autovector> edit_lists; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + cfds.push_back(cfd); + cf_opts.push_back(cfd->GetLatestMutableCFOptions()); + auto iter = version_edits.find(cfd->GetID()); + assert(iter != version_edits.end()); + edit_lists.push_back({&iter->second}); } + // write MANIFEST with update + status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, + directories_.GetDbDir(), + /*new_descriptor_log=*/true); } } @@ -993,8 +1150,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; - auto pending_outputs_inserted_elem = - CaptureCurrentFileNumberInPendingOutputs(); + std::unique_ptr::iterator> pending_outputs_inserted_elem( + new std::list::iterator( + CaptureCurrentFileNumberInPendingOutputs())); meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); ReadOptions ro; ro.total_order_seek = true; @@ -1017,6 +1175,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, int64_t _current_time = 0; env_->GetCurrentTime(&_current_time); // ignore error const uint64_t current_time = static_cast(_current_time); + meta.oldest_ancester_time = current_time; { auto write_hint = cfd->CalculateSSTWriteHint(0); @@ -1066,7 +1225,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction); + meta.marked_for_compaction, meta.oldest_blob_file_number, + meta.oldest_ancester_time, meta.file_creation_time); } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); @@ -1086,12 +1246,23 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { std::vector column_families; column_families.push_back( ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + if (db_options.persist_stats_to_disk) { + column_families.push_back( + ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options)); + } std::vector handles; Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { - assert(handles.size() == 1); + if (db_options.persist_stats_to_disk) { + assert(handles.size() == 2); + } else { + assert(handles.size() == 1); + } // i can delete the handle since DBImpl is always holding a reference to // default column family + if (db_options.persist_stats_to_disk && handles[1] != nullptr) { + delete handles[1]; + } delete handles[0]; } return s; @@ -1204,6 +1375,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, delete impl; return s; } + + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); // Handles create_if_missing, error_if_exists s = impl->Recover(column_families); @@ -1268,6 +1442,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, s = impl->directories_.GetDbDir()->Fsync(); } } + if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) { + // try to read format version but no 
need to fail Open() even if it fails + s = impl->PersistentStatsProcessFormatVersion(); + } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { diff --git a/db/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc similarity index 98% rename from db/db_impl_readonly.cc rename to db/db_impl/db_impl_readonly.cc index 5d7515c28e2..8a8a9c9d051 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -3,10 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "db/db_impl_readonly.h" +#include "db/db_impl/db_impl_readonly.h" +#include "db/arena_wrapped_db_iter.h" #include "db/compacted_db_impl.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" #include "monitoring/perf_context_imp.h" diff --git a/db/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h similarity index 89% rename from db/db_impl_readonly.h rename to db/db_impl/db_impl_readonly.h index 23816210dc8..9f7ad17a475 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -9,13 +9,17 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { class DBImplReadOnly : public DBImpl { public: DBImplReadOnly(const DBOptions& options, const std::string& dbname); + // No copying allowed + DBImplReadOnly(const DBImplReadOnly&) = delete; + void operator=(const DBImplReadOnly&) = delete; + virtual ~DBImplReadOnly(); // Implementations of the DB interface @@ -115,12 +119,18 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + private: friend class DB; - - // No copying allowed - DBImplReadOnly(const DBImplReadOnly&); - void operator=(const DBImplReadOnly&); }; } // namespace rocksdb diff --git a/db/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc similarity index 63% rename from db/db_impl_secondary.cc rename to db/db_impl/db_impl_secondary.cc index 90e979b4e58..8eac41dedb8 100644 --- a/db/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -3,22 +3,18 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
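The read-only implementation above rejects every mutating operation with Status::NotSupported, now including the newer CreateColumnFamilyWithImport entry point. From a caller's perspective, assuming a database already exists at the placeholder path /tmp/rocksdb_example:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  rocksdb::DB* db = nullptr;
  // Open an existing database without write access.
  rocksdb::Status s =
      rocksdb::DB::OpenForReadOnly(options, "/tmp/rocksdb_example", &db);
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "some_key", &value);  // reads work

  // Every mutating entry point is rejected by the read-only implementation.
  s = db->Put(rocksdb::WriteOptions(), "some_key", "new_value");
  assert(s.IsNotSupported());

  delete db;
  return 0;
}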
-#include "db/db_impl_secondary.h" +#include "db/db_impl/db_impl_secondary.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include -#include "db/db_iter.h" +#include "db/arena_wrapped_db_iter.h" #include "db/merge_context.h" +#include "logging/auto_roll_logger.h" #include "monitoring/perf_context_imp.h" -#include "util/auto_roll_logger.h" namespace rocksdb { #ifndef ROCKSDB_LITE - DBImplSecondary::DBImplSecondary(const DBOptions& db_options, const std::string& dbname) : DBImpl(db_options, dbname) { @@ -35,6 +31,7 @@ Status DBImplSecondary::Recover( bool /*error_if_data_exists_in_logs*/) { mutex_.AssertHeld(); + JobContext job_context(0); Status s; s = static_cast(versions_.get()) ->Recover(column_families, &manifest_reader_, &manifest_reporter_, @@ -59,49 +56,73 @@ Status DBImplSecondary::Recover( single_column_family_mode_ = versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; - // Recover from all newer log files than the ones named in the - // descriptor. - std::vector filenames; - s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); - if (s.IsNotFound()) { - return Status::InvalidArgument("Failed to open wal_dir", - immutable_db_options_.wal_dir); - } else if (!s.ok()) { - return s; - } + std::unordered_set cfds_changed; + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } - std::vector logs; - // if log_readers_ is non-empty, it means we have applied all logs with log - // numbers smaller than the smallest log in log_readers_, so there is no - // need to pass these logs to RecoverLogFiles - uint64_t log_number_min = 0; - if (log_readers_.size() > 0) { - log_number_min = log_readers_.begin()->first; - } - for (size_t i = 0; i < filenames.size(); i++) { - uint64_t number; - FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && - number >= log_number_min) { - logs.push_back(number); - } - } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } + // TODO: update options_file_number_ needed? - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - SequenceNumber next_sequence(kMaxSequenceNumber); - s = RecoverLogFiles(logs, &next_sequence, true /*read_only*/); - } + job_context.Clean(); + return s; +} + +// find new WAL and apply them in order to the secondary instance +Status DBImplSecondary::FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); + Status s; + std::vector logs; + s = FindNewLogNumbers(&logs); + if (s.ok() && !logs.empty()) { + SequenceNumber next_sequence(kMaxSequenceNumber); + s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context); } + return s; +} - // TODO: update options_file_number_ needed? 
+// List wal_dir and find all new WALs, return these log numbers +Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { + assert(logs != nullptr); + std::vector filenames; + Status s; + s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); + if (s.IsNotFound()) { + return Status::InvalidArgument("Failed to open wal_dir", + immutable_db_options_.wal_dir); + } else if (!s.ok()) { + return s; + } + // if log_readers_ is non-empty, it means we have applied all logs with log + // numbers smaller than the smallest log in log_readers_, so there is no + // need to pass these logs to RecoverLogFiles + uint64_t log_number_min = 0; + if (!log_readers_.empty()) { + log_number_min = log_readers_.begin()->first; + } + for (size_t i = 0; i < filenames.size(); i++) { + uint64_t number; + FileType type; + if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + number >= log_number_min) { + logs->push_back(number); + } + } + // Recover logs in the order that they were generated + if (!logs->empty()) { + std::sort(logs->begin(), logs->end()); + } return s; } -// try to find log reader using log_number from log_readers_ map, initialize -// if it doesn't exist Status DBImplSecondary::MaybeInitLogReader( uint64_t log_number, log::FragmentBufferedReader** log_reader) { auto iter = log_readers_.find(log_number); @@ -129,7 +150,8 @@ Status DBImplSecondary::MaybeInitLogReader( *log_reader = nullptr; return status; } - file_reader.reset(new SequentialFileReader(std::move(file), fname)); + file_reader.reset(new SequentialFileReader( + std::move(file), fname, immutable_db_options_.log_readahead_size)); } // Create the log reader. @@ -149,7 +171,10 @@ Status DBImplSecondary::MaybeInitLogReader( // REQUIRES: log_numbers are sorted in ascending order Status DBImplSecondary::RecoverLogFiles( const std::vector& log_numbers, SequenceNumber* next_sequence, - bool /*read_only*/) { + std::unordered_set* cfds_changed, + JobContext* job_context) { + assert(nullptr != cfds_changed); + assert(nullptr != job_context); mutex_.AssertHeld(); Status status; for (auto log_number : log_numbers) { @@ -182,9 +207,56 @@ Status DBImplSecondary::RecoverLogFiles( continue; } WriteBatchInternal::SetContents(&batch, record); - // do not check sequence number because user may toggle disableWAL - // between writes which breaks sequence number continuity guarantee - + SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); + std::vector column_family_ids; + status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + const std::vector& l0_files = + cfd->current()->storage_info()->LevelFiles(0); + SequenceNumber seq = + l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno; + // If the write batch's sequence number is smaller than the last + // sequence number of the largest sequence persisted for this column + // family, then its data must reside in an SST that has already been + // added in the prior MANIFEST replay. 
+ if (seq_of_batch <= seq) { + continue; + } + auto curr_log_num = port::kMaxUint64; + if (cfd_to_current_log_.count(cfd) > 0) { + curr_log_num = cfd_to_current_log_[cfd]; + } + // If the active memtable contains records added by replaying an + // earlier WAL, then we need to seal the memtable, add it to the + // immutable memtable list and create a new active memtable. + if (!cfd->mem()->IsEmpty() && (curr_log_num == port::kMaxUint64 || + curr_log_num != log_number)) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + MemTable* new_mem = + cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch); + cfd->mem()->SetNextLogNumber(log_number); + cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free); + new_mem->Ref(); + cfd->SetMemtable(new_mem); + } + } + bool has_valid_writes = false; + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), + nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/, + true, log_number, this, false /* concurrent_memtable_writes */, + next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); + } // If column family was not found, it might mean that the WAL write // batch references to the column family that was dropped after the // insert. We don't want to fail the whole write batch in that case -- @@ -192,36 +264,43 @@ Status DBImplSecondary::RecoverLogFiles( // That's why we set ignore missing column families to true // passing null flush_scheduler will disable memtable flushing which is // needed for secondary instances - bool has_valid_writes = false; - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), nullptr /* flush_scheduler */, - true, log_number, this, false /* concurrent_memtable_writes */, - next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); - if (!status.ok()) { + if (status.ok()) { + for (const auto id : column_family_ids) { + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(id); + if (cfd == nullptr) { + continue; + } + std::unordered_map::iterator iter = + cfd_to_current_log_.find(cfd); + if (iter == cfd_to_current_log_.end()) { + cfd_to_current_log_.insert({cfd, log_number}); + } else if (log_number > iter->second) { + iter->second = log_number; + } + } + auto last_sequence = *next_sequence - 1; + if ((*next_sequence != kMaxSequenceNumber) && + (versions_->LastSequence() <= last_sequence)) { + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); + versions_->SetLastSequence(last_sequence); + } + } else { // We are treating this as a failure while reading since we read valid // blocks that do not form coherent data reader->GetReporter()->Corruption(record.size(), status); - continue; } } - if (!status.ok()) { return status; } - - auto last_sequence = *next_sequence - 1; - if ((*next_sequence != kMaxSequenceNumber) && - (versions_->LastSequence() <= last_sequence)) { - versions_->SetLastAllocatedSequence(last_sequence); - versions_->SetLastPublishedSequence(last_sequence); - versions_->SetLastSequence(last_sequence); - } } // remove logreaders from map after successfully recovering the WAL if (log_readers_.size() > 1) { - auto eraseIter = log_readers_.begin(); - std::advance(eraseIter, log_readers_.size() - 1); - log_readers_.erase(log_readers_.begin(), eraseIter); + auto erase_iter = log_readers_.begin(); + std::advance(erase_iter, log_readers_.size() - 1); + log_readers_.erase(log_readers_.begin(), erase_iter); } return status; 
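The replay loop above first asks CollectColumnFamilyIdsFromWriteBatch (a WriteBatch::Handler declared later in db_impl_secondary.h) which column families a WAL entry touches, and only then inserts the batch. A standalone sketch of the same handler idea, with an illustrative class name and only the Put/Delete/Merge callbacks overridden:

#include <cassert>
#include <cstdint>
#include <unordered_set>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"

// Records the ID of every column family a write batch touches.
class ColumnFamilyIdCollector : public rocksdb::WriteBatch::Handler {
 public:
  rocksdb::Status PutCF(uint32_t cf_id, const rocksdb::Slice& /*key*/,
                        const rocksdb::Slice& /*value*/) override {
    ids_.insert(cf_id);
    return rocksdb::Status::OK();
  }
  rocksdb::Status DeleteCF(uint32_t cf_id,
                           const rocksdb::Slice& /*key*/) override {
    ids_.insert(cf_id);
    return rocksdb::Status::OK();
  }
  rocksdb::Status MergeCF(uint32_t cf_id, const rocksdb::Slice& /*key*/,
                          const rocksdb::Slice& /*value*/) override {
    ids_.insert(cf_id);
    return rocksdb::Status::OK();
  }
  const std::unordered_set<uint32_t>& ids() const { return ids_; }

 private:
  std::unordered_set<uint32_t> ids_;
};

int main() {
  rocksdb::WriteBatch batch;
  batch.Put("k1", "v1");  // default column family, ID 0
  batch.Delete("k2");

  ColumnFamilyIdCollector collector;
  rocksdb::Status s = batch.Iterate(&collector);
  assert(s.ok() && collector.ids().count(0) == 1);
  return 0;
}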
} @@ -373,21 +452,89 @@ Status DBImplSecondary::NewIterators( return Status::OK(); } +Status DBImplSecondary::CheckConsistency() { + mutex_.AssertHeld(); + Status s = DBImpl::CheckConsistency(); + // If DBImpl::CheckConsistency() which is stricter returns success, then we + // do not need to give a second chance. + if (s.ok()) { + return s; + } + // It's possible that DBImpl::CheckConssitency() can fail because the primary + // may have removed certain files, causing the GetFileSize(name) call to + // fail and returning a PathNotFound. In this case, we take a best-effort + // approach and just proceed. + TEST_SYNC_POINT_CALLBACK( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s); + std::vector metadata; + versions_->GetLiveFilesMetaData(&metadata); + + std::string corruption_messages; + for (const auto& md : metadata) { + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; + + uint64_t fsize = 0; + s = env_->GetFileSize(file_path, &fsize); + if (!s.ok() && + (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() || + s.IsPathNotFound())) { + s = Status::OK(); + } + if (!s.ok()) { + corruption_messages += + "Can't access " + md.name + ": " + s.ToString() + "\n"; + } + } + return corruption_messages.empty() ? Status::OK() + : Status::Corruption(corruption_messages); +} + Status DBImplSecondary::TryCatchUpWithPrimary() { assert(versions_.get() != nullptr); assert(manifest_reader_.get() != nullptr); Status s; + // read the manifest and apply new changes to the secondary instance std::unordered_set cfds_changed; + JobContext job_context(0, true /*create_superversion*/); InstrumentedMutexLock lock_guard(&mutex_); s = static_cast(versions_.get()) ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, + static_cast(versions_->LastSequence())); + for (ColumnFamilyData* cfd : cfds_changed) { + if (cfd->IsDropped()) { + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n", + cfd->GetName().c_str()); + continue; + } + VersionStorageInfo::LevelSummaryStorage tmp; + ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Level summary: %s\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp)); + } + + // list wal_dir to discover new WALs and apply new changes to the secondary + // instance + if (s.ok()) { + s = FindAndRecoverLogFiles(&cfds_changed, &job_context); + } + if (s.IsPathNotFound()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Secondary tries to read WAL, but WAL file(s) have already " + "been purged by primary."); + s = Status::OK(); + } if (s.ok()) { - SuperVersionContext sv_context(true /* create_superversion */); for (auto cfd : cfds_changed) { - sv_context.NewSuperVersion(); + cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(), + &job_context.memtables_to_free); + auto& sv_context = job_context.superversion_contexts.back(); cfd->InstallSuperVersion(&sv_context, &mutex_); + sv_context.NewSuperVersion(); } - sv_context.Clean(); + job_context.Clean(); } return s; } @@ -425,39 +572,14 @@ Status DB::OpenAsSecondary( } DBOptions tmp_opts(db_options); + Status s; if (nullptr == tmp_opts.info_log) { - Env* env = tmp_opts.env; - assert(env != nullptr); - std::string secondary_abs_path; - env->GetAbsolutePath(secondary_path, &secondary_abs_path); - std::string fname = InfoLogFileName(secondary_path, secondary_abs_path, - tmp_opts.db_log_dir); - - env->CreateDirIfMissing(secondary_path); - if 
(tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { - AutoRollLogger* result = new AutoRollLogger( - env, secondary_path, tmp_opts.db_log_dir, tmp_opts.max_log_file_size, - tmp_opts.log_file_time_to_roll, tmp_opts.info_log_level); - Status s = result->GetStatus(); - if (!s.ok()) { - delete result; - } else { - tmp_opts.info_log.reset(result); - } - } - if (nullptr == tmp_opts.info_log) { - env->RenameFile( - fname, OldInfoLogFileName(secondary_path, env->NowMicros(), - secondary_abs_path, tmp_opts.db_log_dir)); - Status s = env->NewLogger(fname, &(tmp_opts.info_log)); - if (tmp_opts.info_log != nullptr) { - tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); - } + s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log); + if (!s.ok()) { + tmp_opts.info_log = nullptr; } } - assert(tmp_opts.info_log != nullptr); - handles->clear(); DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); impl->versions_.reset(new ReactiveVersionSet( @@ -466,8 +588,10 @@ Status DB::OpenAsSecondary( &impl->write_controller_)); impl->column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->wal_in_db_path_ = IsWalDirSameAsDBPath(&impl->immutable_db_options_); + impl->mutex_.Lock(); - Status s = impl->Recover(column_families, true, false, false); + s = impl->Recover(column_families, true, false, false); if (s.ok()) { for (auto cf : column_families) { auto cfd = diff --git a/db/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h similarity index 58% rename from db/db_impl_secondary.h rename to db/db_impl/db_impl_secondary.h index 64c81432848..ca853e25802 100644 --- a/db/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -9,10 +9,11 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { +// A wrapper class to hold log reader, log reporter, log status. class LogReaderContainer { public: LogReaderContainer() @@ -62,11 +63,19 @@ class LogReaderContainer { }; }; +// The secondary instance shares access to the storage as the primary. +// The secondary is able to read and replay changes described in both the +// MANIFEST and the WAL files without coordination with the primary. +// The secondary instance can be opened using `DB::OpenAsSecondary`. After +// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make best +// effort attempts to catch up with the primary. class DBImplSecondary : public DBImpl { public: DBImplSecondary(const DBOptions& options, const std::string& dbname); ~DBImplSecondary() override; + // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ + // and log_readers_ to facilitate future operations. 
Status Recover(const std::vector& column_families, bool read_only, bool error_if_log_file_exist, bool error_if_data_exists_in_logs) override; @@ -96,40 +105,40 @@ class DBImplSecondary : public DBImpl { Status Put(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Merge; Status Merge(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*value*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Delete; Status Delete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::SingleDelete; Status SingleDelete(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactRange; Status CompactRange(const CompactRangeOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice* /*begin*/, const Slice* /*end*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::CompactFiles; @@ -140,32 +149,32 @@ class DBImplSecondary : public DBImpl { const int /*output_level*/, const int /*output_path_id*/ = -1, std::vector* const /*output_file_names*/ = nullptr, CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status DisableFileDeletions() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status EnableFileDeletions(bool /*force*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } Status GetLiveFiles(std::vector&, uint64_t* /*manifest_file_size*/, bool /*flush_memtable*/ = true) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::Flush; Status Flush(const FlushOptions& /*options*/, ColumnFamilyHandle* /*column_family*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } using DBImpl::SyncWAL; Status SyncWAL() override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported 
operation in secondary mode."); } using DB::IngestExternalFile; @@ -173,7 +182,7 @@ class DBImplSecondary : public DBImpl { ColumnFamilyHandle* /*column_family*/, const std::vector& /*external_files*/, const IngestExternalFileOptions& /*ingestion_options*/) override { - return Status::NotSupported("Not supported operation in read only mode."); + return Status::NotSupported("Not supported operation in secondary mode."); } // Try to catch up with the primary by reading as much as possible from the @@ -182,9 +191,84 @@ class DBImplSecondary : public DBImpl { // method can take long time due to all the I/O and CPU costs. Status TryCatchUpWithPrimary() override; + + // Try to find log reader using log_number from log_readers_ map, initialize + // if it doesn't exist Status MaybeInitLogReader(uint64_t log_number, log::FragmentBufferedReader** log_reader); + // Check if all live files exist on file system and that their file sizes + // matche to the in-memory records. It is possible that some live files may + // have been deleted by the primary. In this case, CheckConsistency() does + // not flag the missing file as inconsistency. + Status CheckConsistency() override; + + protected: + // ColumnFamilyCollector is a write batch handler which does nothing + // except recording unique column family IDs + class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + if (column_family_ids_.find(column_family_id) == + column_family_ids_.end()) { + column_family_ids_.insert(column_family_id); + } + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + const std::unordered_set& column_families() const { + return column_family_ids_; + } + }; + + Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; + } + private: friend class DB; @@ -194,17 +278,27 @@ class DBImplSecondary : public DBImpl { using DBImpl::Recover; + Status FindAndRecoverLogFiles( + std::unordered_set* cfds_changed, + JobContext* job_context); + Status FindNewLogNumbers(std::vector* logs); + // After manifest recovery, replay WALs and refresh log_readers_ if necessary + // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector& log_numbers, SequenceNumber* next_sequence, - bool read_only) override; + std::unordered_set* cfds_changed, + 
JobContext* job_context); std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; - // cache log readers for each log number, used for continue WAL replay + // Cache log readers for each log number, used for continue WAL replay // after recovery std::map> log_readers_; + + // Current WAL number replayed for each column family. + std::unordered_map cfd_to_current_log_; }; } // namespace rocksdb diff --git a/db/db_impl_write.cc b/db/db_impl/db_impl_write.cc similarity index 80% rename from db/db_impl_write.cc rename to db/db_impl/db_impl_write.cc index 3edec9ac521..34193cabc95 100644 --- a/db/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -6,17 +6,14 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif -#include +#include #include "db/error_handler.h" #include "db/event_helpers.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { // Convenience methods @@ -94,6 +91,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "pipelined_writes is not compatible with seq_per_batch"); } + if (immutable_db_options_.unordered_write && + immutable_db_options_.enable_pipelined_write) { + return Status::NotSupported( + "pipelined_writes is not compatible with unordered_write"); + } // Otherwise IsLatestPersistentState optimization does not make sense assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || disable_memtable); @@ -107,8 +109,39 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (two_write_queues_ && disable_memtable) { - return WriteImplWALOnly(write_options, my_batch, callback, log_used, - log_ref, seq_used, batch_cnt, pre_release_callback); + AssignOrder assign_order = + seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder; + // Otherwise it is WAL-only Prepare batches in WriteCommitted policy and + // they don't consume sequence. + return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch, + callback, log_used, log_ref, seq_used, batch_cnt, + pre_release_callback, assign_order, + kDontPublishLastSeq, disable_memtable); + } + + if (immutable_db_options_.unordered_write) { + const size_t sub_batch_cnt = batch_cnt != 0 + ? 
batch_cnt + // every key is a sub-batch consuming a seq + : WriteBatchInternal::Count(my_batch); + uint64_t seq; + // Use a write thread to i) optimize for WAL write, ii) publish last + // sequence in in increasing order, iii) call pre_release_callback serially + status = WriteImplWALOnly(&write_thread_, write_options, my_batch, callback, + log_used, log_ref, &seq, sub_batch_cnt, + pre_release_callback, kDoAssignOrder, + kDoPublishLastSeq, disable_memtable); + if (!status.ok()) { + return status; + } + if (seq_used) { + *seq_used = seq; + } + if (!disable_memtable) { + status = UnorderedWriteMemtable(write_options, my_batch, callback, + log_ref, seq, sub_batch_cnt); + } + return status; } if (immutable_db_options_.enable_pipelined_write) { @@ -138,8 +171,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, versions_->GetColumnFamilySet()); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt); + true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt, + batch_per_txn_, write_options.memtable_insert_hint_per_batch); PERF_TIMER_START(write_pre_and_post_process_time); } @@ -178,9 +213,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, WriteThread::WriteGroup write_group; bool in_parallel_group = false; uint64_t last_sequence = kMaxSequenceNumber; - if (!two_write_queues_) { - last_sequence = versions_->LastSequence(); - } mutex_.Lock(); @@ -195,6 +227,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_STOP(write_pre_and_post_process_time); status = PreprocessWrite(write_options, &need_log_sync, &write_context); + if (!two_write_queues_) { + // Assign it after ::PreprocessWrite since the sequence might advance + // inside it by WriteRecoverableState + last_sequence = versions_->LastSequence(); + } PERF_TIMER_START(write_pre_and_post_process_time); } @@ -228,6 +265,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, size_t total_count = 0; size_t valid_batches = 0; size_t total_byte_size = 0; + size_t pre_release_callback_cnt = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { valid_batches += writer->batch_cnt; @@ -235,9 +273,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, total_count += WriteBatchInternal::Count(writer->batch); parallel = parallel && !writer->batch->HasMerge(); } - total_byte_size = WriteBatchInternal::AppendedByteSize( total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } } } // Note about seq_per_batch_: either disableWAL is set for the entire write @@ -254,18 +294,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // We're optimistic, updating the stats before we successfully // commit. That lets us release our leader status early. 
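The new unordered_write path above splits a write into a WAL-only pass through the write thread (WriteImplWALOnly with kDoAssignOrder/kDoPublishLastSeq) followed by an out-of-order memtable insert (UnorderedWriteMemtable). Per the ValidateOptions changes earlier in this diff, the option requires allow_concurrent_memtable_write and excludes enable_pipelined_write. A minimal sketch of enabling it, with a placeholder path; the comment on semantics is a summary, not wording from this diff:

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Trades the ordering/snapshot guarantee between concurrent writers for
  // higher throughput; higher-level commit protocols (e.g. WritePrepared
  // transactions) can restore the guarantee if needed.
  options.unordered_write = true;
  // Required combination per ValidateOptions: concurrent memtable writes on
  // (the default) and pipelined writes off (also the default).
  options.allow_concurrent_memtable_write = true;
  options.enable_pipelined_write = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_unordered", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  delete db;
  return 0;
}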
auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count, + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count, concurrent_update); RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); - stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size, + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, concurrent_update); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); - stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1, concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); RecordTick(stats_, WRITE_DONE_BY_SELF); auto write_done_by_other = write_group.size - 1; if (write_done_by_other > 0) { - stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, write_done_by_other, - concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); } RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); @@ -301,6 +342,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // PreReleaseCallback is called after WAL write and before memtable write if (status.ok()) { SequenceNumber next_sequence = current_sequence; + size_t index = 0; // Note: the logic for advancing seq here must be consistent with the // logic in WriteBatchInternal::InsertInto(write_group...) as well as // with WriteBatchInternal::InsertInto(write_batch...) that is called on @@ -312,7 +354,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, writer->sequence = next_sequence; if (writer->pre_release_callback) { Status ws = writer->pre_release_callback->Callback( - writer->sequence, disable_memtable, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -334,7 +377,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // w.sequence will be set inside InsertInto w.status = WriteBatchInternal::InsertInto( write_group, current_sequence, column_family_memtables_.get(), - &flush_scheduler_, write_options.ignore_missing_column_families, + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, batch_per_txn_); } else { @@ -350,9 +394,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, assert(w.sequence == current_sequence); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, seq_per_batch_, - w.batch_cnt, batch_per_txn_); + w.batch_cnt, batch_per_txn_, + write_options.memtable_insert_hint_per_batch); } } if (seq_used != nullptr) { @@ -459,9 +505,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, total_count); + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); - stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size); + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); @@ -469,10 +515,10 @@ Status 
DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, if (w.status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); - stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); RecordTick(stats_, WRITE_DONE_BY_SELF, 1); if (wal_write_group.size > 1) { - stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, wal_write_group.size - 1); RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1); } @@ -504,9 +550,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } else { memtable_write_group.status = WriteBatchInternal::InsertInto( memtable_write_group, w.sequence, column_family_memtables_.get(), - &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*log_number*/, this, false /*concurrent_memtable_writes*/, - seq_per_batch_, batch_per_txn_); + &flush_scheduler_, &trim_history_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_); versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } @@ -518,8 +564,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, versions_->GetColumnFamilySet()); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, - write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*concurrent_memtable_writes*/); + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); if (write_thread_.CompleteParallelMemTableWriter(&w)) { MemTableInsertStatusCheck(w.status); versions_->SetLastSequence(w.write_group->last_sequence); @@ -534,23 +582,72 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, return w.FinalStatus(); } +Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, + WriteBatch* my_batch, + WriteCallback* callback, uint64_t log_ref, + SequenceNumber seq, + const size_t sub_batch_cnt) { + PERF_TIMER_GUARD(write_pre_and_post_process_time); + StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + + WriteThread::Writer w(write_options, my_batch, callback, log_ref, + false /*disable_memtable*/); + + if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { + w.sequence = seq; + size_t total_count = WriteBatchInternal::Count(my_batch); + InternalStats* stats = default_cf_internal_stats_; + stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count); + RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); + + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + w.status = WriteBatchInternal::InsertInto( + &w, w.sequence, &column_family_memtables, &flush_scheduler_, + &trim_history_scheduler_, write_options.ignore_missing_column_families, + 0 /*log_number*/, this, true /*concurrent_memtable_writes*/, + seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/, + write_options.memtable_insert_hint_per_batch); + + WriteStatusCheck(w.status); + if (write_options.disableWAL) { + has_unpersisted_data_.store(true, std::memory_order_relaxed); + } + } + + size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1; + if (pending_cnt 
== 0) { + // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex + // before notify ensures that cv is in waiting state when it is notified + // thus not missing the update to pending_memtable_writes_ even though it is + // not modified under the mutex. + std::lock_guard lck(switch_mutex_); + switch_cv_.notify_all(); + } + + if (!w.FinalStatus().ok()) { + return w.FinalStatus(); + } + return Status::OK(); +} + // The 2nd write queue. If enabled it will be used only for WAL-only writes. // This is the only queue that updates LastPublishedSequence which is only // applicable in a two-queue setting. -Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, - WriteBatch* my_batch, WriteCallback* callback, - uint64_t* log_used, uint64_t log_ref, - uint64_t* seq_used, size_t batch_cnt, - PreReleaseCallback* pre_release_callback) { +Status DBImpl::WriteImplWALOnly( + WriteThread* write_thread, const WriteOptions& write_options, + WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, + const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, + PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, + const PublishLastSeq publish_last_seq, const bool disable_memtable) { Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - true /* disable_memtable */, batch_cnt, - pre_release_callback); + disable_memtable, sub_batch_cnt, pre_release_callback); RecordTick(stats_, WRITE_WITH_WAL); StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); - nonmem_write_thread_.JoinBatchGroup(&w); + write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); if (w.state == WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -563,17 +660,45 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); + + if (publish_last_seq == kDoPublishLastSeq) { + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + WriteContext write_context; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + // TODO(myabandeh): Make preliminary checks thread-safe so we could do them + // without paying the cost of obtaining the mutex. 
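UnorderedWriteMemtable above decrements pending_memtable_writes_ and, when the count reaches zero, takes switch_mutex_ before notifying switch_cv_ so a waiter that rechecks the counter under the mutex cannot sleep through the update. A generic sketch of that lock-before-notify pattern with standard-library types (all names here are illustrative, not the RocksDB members):

#include <atomic>
#include <condition_variable>
#include <mutex>

std::atomic<size_t> pending_writes{0};
std::mutex switch_mutex;
std::condition_variable switch_cv;

// Writer side: finish one write; if we were the last one, wake the waiter.
void FinishOneWrite() {
  size_t pending = pending_writes.fetch_sub(1) - 1;
  if (pending == 0) {
    // Taking the mutex before notify ensures the waiter is either not yet
    // checking the predicate or already blocked in wait(), so the update to
    // pending_writes cannot slip between its check and its sleep.
    std::lock_guard<std::mutex> lock(switch_mutex);
    switch_cv.notify_all();
  }
}

// Waiter side (e.g. a memtable switch): block until no write is in flight.
void WaitForPendingWrites() {
  std::unique_lock<std::mutex> lock(switch_mutex);
  switch_cv.wait(lock, [] { return pending_writes.load() == 0; });
}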
+ if (status.ok()) { + InstrumentedMutexLock l(&mutex_); + bool need_log_sync = false; + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + WriteStatusCheck(status); + } + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } + } + WriteThread::WriteGroup write_group; uint64_t last_sequence; - nonmem_write_thread_.EnterAsBatchGroupLeader(&w, &write_group); + write_thread->EnterAsBatchGroupLeader(&w, &write_group); // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only + size_t pre_release_callback_cnt = 0; size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { total_byte_size = WriteBatchInternal::AppendedByteSize( total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + if (writer->pre_release_callback) { + pre_release_callback_cnt++; + } } } @@ -583,15 +708,16 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // We're optimistic, updating the stats before we successfully // commit. That lets us release our leader status early. auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size, + stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size, concurrent_update); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); - stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1, concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1, + concurrent_update); RecordTick(stats_, WRITE_DONE_BY_SELF); auto write_done_by_other = write_group.size - 1; if (write_done_by_other > 0) { - stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, write_done_by_other, - concurrent_update); + stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther, + write_done_by_other, concurrent_update); RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); } RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); @@ -602,11 +728,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL size_t seq_inc = 0 /* total_count */; - if (seq_per_batch_) { + if (assign_order == kDoAssignOrder) { size_t total_batch_cnt = 0; for (auto* writer : write_group) { - assert(writer->batch_cnt); - total_batch_cnt += writer->batch_cnt; + assert(writer->batch_cnt || !seq_per_batch_); + if (!writer->CallbackFailed()) { + total_batch_cnt += writer->batch_cnt; + } } seq_inc = total_batch_cnt; } @@ -617,16 +745,21 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, // Otherwise we inc seq number to do solely the seq allocation last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); } + + size_t memtable_write_cnt = 0; auto curr_seq = last_sequence + 1; for (auto* writer : write_group) { if (writer->CallbackFailed()) { continue; } writer->sequence = curr_seq; - if (seq_per_batch_) { - assert(writer->batch_cnt); + if (assign_order == kDoAssignOrder) { + assert(writer->batch_cnt || !seq_per_batch_); curr_seq += writer->batch_cnt; } + if (!writer->disable_memtable) { + memtable_write_cnt++; + } // else seq advances only by memtable writes } if (status.ok() && write_options.sync) { @@ -645,12 +778,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, WriteStatusCheck(status); } if (status.ok()) { + size_t 
index = 0; for (auto* writer : write_group) { if (!writer->CallbackFailed() && writer->pre_release_callback) { assert(writer->sequence != kMaxSequenceNumber); - const bool DISABLE_MEMTABLE = true; Status ws = writer->pre_release_callback->Callback( - writer->sequence, DISABLE_MEMTABLE, writer->log_used); + writer->sequence, disable_memtable, writer->log_used, index++, + pre_release_callback_cnt); if (!ws.ok()) { status = ws; break; @@ -658,7 +792,15 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } } } - nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status); + if (publish_last_seq == kDoPublishLastSeq) { + versions_->SetLastSequence(last_sequence + seq_inc); + // Currently we only use kDoPublishLastSeq in unordered_write + assert(immutable_db_options_.unordered_write); + } + if (immutable_db_options_.unordered_write && status.ok()) { + pending_memtable_writes_ += memtable_write_cnt; + } + write_thread->ExitAsBatchGroupLeader(write_group, status); if (status.ok()) { status = w.FinalStatus(); } @@ -710,6 +852,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); if (UNLIKELY(status.ok() && !single_column_family_mode_ && total_log_size_ > GetMaxTotalWalSize())) { + WaitForPendingWrites(); status = SwitchWAL(write_context); } @@ -719,10 +862,16 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // thread is writing to another DB with the same write buffer, they may also // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. + WaitForPendingWrites(); status = HandleWriteBufferFull(write_context); } + if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { + status = TrimMemtableHistory(write_context); + } + if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + WaitForPendingWrites(); status = ScheduleFlushes(write_context); } @@ -900,12 +1049,12 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, if (status.ok()) { auto stats = default_cf_internal_stats_; if (need_log_sync) { - stats->AddDBStats(InternalStats::WAL_FILE_SYNCED, 1); + stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1); RecordTick(stats_, WAL_FILE_SYNCED); } - stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size); + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size); RecordTick(stats_, WAL_FILE_BYTES, log_size); - stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal); + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } return status; @@ -951,9 +1100,10 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, if (status.ok()) { const bool concurrent = true; auto stats = default_cf_internal_stats_; - stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size, concurrent); + stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size, + concurrent); RecordTick(stats_, WAL_FILE_BYTES, log_size); - stats->AddDBStats(InternalStats::WRITE_WITH_WAL, write_with_wal, + stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal, concurrent); RecordTick(stats_, WRITE_WITH_WAL, write_with_wal); } @@ -977,9 +1127,9 @@ Status DBImpl::WriteRecoverableState() { WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1); auto status = WriteBatchInternal::InsertInto( &cached_recoverable_state_, column_family_memtables_.get(), - &flush_scheduler_, true, 0 
/*recovery_log_number*/, this, - false /* concurrent_memtable_writes */, &next_seq, &dont_care_bool, - seq_per_batch_); + &flush_scheduler_, &trim_history_scheduler_, true, + 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */, + &next_seq, &dont_care_bool, seq_per_batch_); auto last_seq = next_seq - 1; if (two_write_queues_) { versions_->FetchAddLastAllocatedSequence(last_seq - seq); @@ -994,8 +1144,12 @@ Status DBImpl::WriteRecoverableState() { for (uint64_t sub_batch_seq = seq + 1; sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) { uint64_t const no_log_num = 0; + // Unlock it since the callback might end up locking mutex. e.g., + // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB + mutex_.Unlock(); status = recoverable_state_pre_release_callback_->Callback( - sub_batch_seq, !DISABLE_MEMTABLE, no_log_num); + sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1); + mutex_.Lock(); } } if (status.ok()) { @@ -1090,7 +1244,13 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { cfds.push_back(cfd); } } + MaybeFlushStatsCF(&cfds); + } + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); } + for (const auto cfd : cfds) { cfd->Ref(); status = SwitchMemtable(cfd, write_context); @@ -1099,6 +1259,10 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { break; } } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); @@ -1156,8 +1320,13 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { if (cfd_picked != nullptr) { cfds.push_back(cfd_picked); } + MaybeFlushStatsCF(&cfds); } + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } for (const auto cfd : cfds) { if (cfd->mem()->IsEmpty()) { continue; @@ -1169,6 +1338,10 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { break; } } + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); @@ -1249,8 +1422,8 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, } assert(!delayed || !write_options.no_slowdown); if (delayed) { - default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS, - time_delayed); + default_cf_internal_stats_->AddDBStats( + InternalStats::kIntStatsWriteStallMicros, time_delayed); RecordTick(stats_, STALL_MICROS, time_delayed); } @@ -1299,6 +1472,65 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, return Status::OK(); } +void DBImpl::MaybeFlushStatsCF(autovector* cfds) { + assert(cfds != nullptr); + if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) { + ColumnFamilyData* cfd_stats = + versions_->GetColumnFamilySet()->GetColumnFamily( + kPersistentStatsColumnFamilyName); + if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) { + for (ColumnFamilyData* cfd : *cfds) { + if (cfd == cfd_stats) { + // stats CF already included in cfds + return; + } + } + // force flush stats CF when its log number is less than all other CF's + // log numbers + bool force_flush_stats_cf = true; + for (auto* loop_cfd : *versions_->GetColumnFamilySet()) { + if (loop_cfd == cfd_stats) { + continue; + } + if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) { + force_flush_stats_cf = false; + } + } + if (force_flush_stats_cf) { + 
cfds->push_back(cfd_stats); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Force flushing stats CF with automated flush " + "to avoid holding old logs"); + } + } + } +} + +Status DBImpl::TrimMemtableHistory(WriteContext* context) { + autovector cfds; + ColumnFamilyData* tmp_cfd; + while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) != + nullptr) { + cfds.push_back(tmp_cfd); + } + for (auto& cfd : cfds) { + autovector to_delete; + cfd->imm()->TrimHistory(&to_delete, cfd->mem()->ApproximateMemoryUsage()); + for (auto m : to_delete) { + delete m; + } + context->superversion_context.NewSuperVersion(); + assert(context->superversion_context.new_superversion.get() != nullptr); + cfd->InstallSuperVersion(&context->superversion_context, &mutex_); + + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } + } + return Status::OK(); +} + Status DBImpl::ScheduleFlushes(WriteContext* context) { autovector cfds; if (immutable_db_options_.atomic_flush) { @@ -1312,8 +1544,14 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { cfds.push_back(tmp_cfd); } + MaybeFlushStatsCF(&cfds); } Status status; + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } + for (auto& cfd : cfds) { if (!cfd->mem()->IsEmpty()) { status = SwitchMemtable(cfd, context); @@ -1326,6 +1564,11 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { break; } } + + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); @@ -1356,15 +1599,11 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, // REQUIRES: mutex_ is held // REQUIRES: this thread is currently at the front of the writer queue +// REQUIRES: this thread is currently at the front of the 2nd writer queue if +// two_write_queues_ is true (This is to simplify the reasoning.) Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); WriteThread::Writer nonmem_w; - if (two_write_queues_) { - // SwitchMemtable is a rare event. To simply the reasoning, we make sure - // that there is no concurrent thread writing to WAL. - nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); - } - std::unique_ptr lfile; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; @@ -1376,16 +1615,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { return s; } - // In case of pipelined write is enabled, wait for all pending memtable - // writers. - if (immutable_db_options_.enable_pipelined_write) { - // Memtable writers may call DB::Get in case max_successive_merges > 0, - // which may lock mutex. Unlocking mutex here to avoid deadlock. - mutex_.Unlock(); - write_thread_.WaitForMemTableWriters(); - mutex_.Lock(); - } - // Attempt to switch to a new memtable and trigger flush of old. // Do this without holding the dbmutex lock. 
assert(versions_->prev_log_number() == 0); @@ -1481,10 +1710,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); // Read back bg_error in order to get the right severity s = error_handler_.GetBGError(); - - if (two_write_queues_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); - } return s; } @@ -1515,9 +1740,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { NotifyOnMemTableSealed(cfd, memtable_info); mutex_.Lock(); #endif // ROCKSDB_LITE - if (two_write_queues_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); - } return s; } @@ -1547,14 +1769,30 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { // can call if they wish Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - // Pre-allocate size of write batch conservatively. - // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, - // and we allocate 11 extra bytes for key length, as well as value length. - WriteBatch batch(key.size() + value.size() + 24); + if (nullptr == opt.timestamp) { + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. + WriteBatch batch(key.size() + value.size() + 24); + Status s = batch.Put(column_family, key, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); + } + const Slice* ts = opt.timestamp; + assert(nullptr != ts); + size_t ts_sz = ts->size(); + WriteBatch batch(key.size() + ts_sz + value.size() + 24, /*max_bytes=*/0, + ts_sz); Status s = batch.Put(column_family, key, value); if (!s.ok()) { return s; } + s = batch.AssignTimestamp(*ts); + if (!s.ok()) { + return s; + } return Write(opt, &batch); } diff --git a/db/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc similarity index 65% rename from db/db_secondary_test.cc rename to db/db_impl/db_secondary_test.cc index 47daf9fd8cc..6caff005eb4 100644 --- a/db/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -7,11 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
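The DB::Put change at the end of the db_impl_write.cc diff above sizes the WriteBatch for an extra timestamp and stamps the batch via AssignTimestamp before calling Write. A hedged sketch of how a caller might drive that path through WriteOptions::timestamp follows; the helper name is hypothetical, and it assumes the column family was opened with a comparator set up for fixed-size user timestamps, which was an experimental feature at the time.

#include <string>
#include "rocksdb/db.h"

// Hypothetical helper, not part of the patch.
rocksdb::Status PutWithTimestamp(rocksdb::DB* db,
                                 rocksdb::ColumnFamilyHandle* cf,
                                 const rocksdb::Slice& key,
                                 const rocksdb::Slice& value,
                                 const rocksdb::Slice& ts) {
  rocksdb::WriteOptions wopts;
  wopts.timestamp = &ts;  // the pointer DB::Put above checks against nullptr
  // DB::Put then reserves key + ts + value + 24 bytes for the WriteBatch and
  // calls WriteBatch::AssignTimestamp(ts) before forwarding to Write().
  return db->Put(wopts, cf, key, value);
}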
-#include "db/db_impl_secondary.h" +#include "db/db_impl/db_impl_secondary.h" #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" -#include "util/sync_point.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" namespace rocksdb { @@ -237,6 +237,17 @@ TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) { }; verify_db_func("foo_value2", "bar_value2"); + + ASSERT_OK(Put("foo", "new_foo_value")); + ASSERT_OK(Put("bar", "new_bar_value")); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value", "new_bar_value"); + + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "new_foo_value_1")); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db_func("new_foo_value_1", "new_bar_value"); } TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) { @@ -362,7 +373,7 @@ TEST_F(DBSecondaryTest, MissingTableFile) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", [&](void* arg) { Status s = *reinterpret_cast(arg); if (s.IsPathNotFound()) { @@ -513,6 +524,256 @@ TEST_F(DBSecondaryTest, SwitchManifest) { ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); range_scan_db(); } + +// Here, "Snapshot" refers to the version edits written by +// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after +// switching from the old one. +TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); +} + +TEST_F(DBSecondaryTest, SwitchWAL) { + const int kNumKeysPerMemtable = 1; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const auto& verify_db = [](DB* db1, DB* db2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + std::unique_ptr it1(db1->NewIterator(read_opts)); + std::unique_ptr it2(db2->NewIterator(read_opts)); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + }; + for 
(int k = 0; k != 16; ++k) { + ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), db_secondary_); + } +} + +TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { + const int kNumKeysPerMemtable = 1; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:ContextCleanedUp", + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}}); + SyncPoint::GetInstance()->EnableProcessing(); + const std::string kCFName1 = "pikachu"; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + CreateAndReopenWithCF({kCFName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCFName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + const auto& verify_db = [](DB* db1, + const std::vector& handles1, + DB* db2, + const std::vector& handles2) { + ASSERT_NE(nullptr, db1); + ASSERT_NE(nullptr, db2); + ReadOptions read_opts; + read_opts.verify_checksums = true; + ASSERT_EQ(handles1.size(), handles2.size()); + for (size_t i = 0; i != handles1.size(); ++i) { + std::unique_ptr it1(db1->NewIterator(read_opts, handles1[i])); + std::unique_ptr it2(db2->NewIterator(read_opts, handles2[i])); + it1->SeekToFirst(); + it2->SeekToFirst(); + for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) { + ASSERT_EQ(it1->key(), it2->key()); + ASSERT_EQ(it1->value(), it2->value()); + } + ASSERT_FALSE(it1->Valid()); + ASSERT_FALSE(it2->Valid()); + + for (it1->SeekToFirst(); it1->Valid(); it1->Next()) { + std::string value; + ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value)); + ASSERT_EQ(it1->value(), value); + } + for (it2->SeekToFirst(); it2->Valid(); it2->Next()) { + std::string value; + ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value)); + ASSERT_EQ(it2->value(), value); + } + } + }; + for (int k = 0; k != 8; ++k) { + ASSERT_OK( + Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + ASSERT_OK( + Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + TEST_SYNC_POINT( + "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + verify_db(dbfull(), handles_, db_secondary_, handles_secondary_); + SyncPoint::GetInstance()->ClearTrace(); + } +} + +TEST_F(DBSecondaryTest, CatchUpAfterFlush) { + const int kNumKeysPerMemtable = 16; + Options options; + options.env = env_; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 2; + options.memtable_factory.reset( + new SpecialSkipListFactory(kNumKeysPerMemtable)); + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + WriteOptions write_opts; + WriteBatch wb; + wb.Put("key0", "value0"); + wb.Put("key1", "value1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb)); + ReadOptions read_opts; + std::unique_ptr iter1(db_secondary_->NewIterator(read_opts)); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + iter1->Seek("key0"); + ASSERT_FALSE(iter1->Valid()); + iter1->Seek("key1"); + ASSERT_FALSE(iter1->Valid()); + std::unique_ptr 
iter2(db_secondary_->NewIterator(read_opts)); + iter2->Seek("key0"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value0", iter2->value()); + iter2->Seek("key1"); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ("value1", iter2->value()); + + { + WriteBatch wb1; + wb1.Put("key0", "value01"); + wb1.Put("key1", "value11"); + ASSERT_OK(dbfull()->Write(write_opts, &wb1)); + } + + { + WriteBatch wb2; + wb2.Put("key0", "new_value0"); + wb2.Delete("key1"); + ASSERT_OK(dbfull()->Write(write_opts, &wb2)); + } + + ASSERT_OK(Flush()); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::unique_ptr iter3(db_secondary_->NewIterator(read_opts)); + // iter3 should not see value01 and value11 at all. + iter3->Seek("key0"); + ASSERT_TRUE(iter3->Valid()); + ASSERT_EQ("new_value0", iter3->value()); + iter3->Seek("key1"); + ASSERT_FALSE(iter3->Valid()); +} + +TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) { + bool called = false; + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) { + ASSERT_NE(nullptr, arg); + called = true; + auto* s = reinterpret_cast(arg); + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "DBImpl::CheckConsistency:BeforeGetFileSize"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("a", "value0")); + ASSERT_OK(Put("c", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "value1")); + ASSERT_OK(Put("d", "value1")); + ASSERT_OK(Flush()); + port::Thread thread([this]() { + Options opts; + opts.env = env_; + opts.max_open_files = -1; + OpenSecondary(opts); + }); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + thread.join(); + ASSERT_TRUE(called); +} #endif //! ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index 31050d20a29..7ab7e3337aa 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -3,20 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
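The secondary-instance tests above drive WAL tailing, flush catch-up and the open-time consistency check through the test helpers OpenSecondary and TryCatchUpWithPrimary. For orientation, here is a hedged sketch of the public API those helpers sit on top of; the paths are placeholders, and max_open_files = -1 is set because the tests above use it for the secondary.

#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.max_open_files = -1;  // mirrors what the tests above require

  rocksdb::DB* secondary = nullptr;
  // "/path/to/primary" and "/path/to/secondary" are placeholders.
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/path/to/primary", "/path/to/secondary", &secondary);
  assert(s.ok());

  // Pull in whatever the primary has written (WAL tailing + MANIFEST replay).
  s = secondary->TryCatchUpWithPrimary();
  assert(s.ok());

  std::string value;
  s = secondary->Get(rocksdb::ReadOptions(), "some_key", &value);

  delete secondary;
  return (s.ok() || s.IsNotFound()) ? 0 : 1;
}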
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "db/db_info_dumper.h" -#include #include -#include #include +#include +#include #include +#include "file/filename.h" #include "rocksdb/env.h" -#include "util/filename.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index 43a56af78c7..b2675c520a6 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -16,6 +16,9 @@ #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" +#include "file/filename.h" +#include "logging/logging.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -23,12 +26,9 @@ #include "rocksdb/options.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" -#include "util/filename.h" -#include "util/logging.h" +#include "trace_replay/trace_replay.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/trace_replay.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { @@ -46,318 +46,74 @@ static void DumpInternalIter(Iterator* iter) { } #endif -// Memtables and sstables that make the DB representation contain -// (userkey,seq,type) => uservalue entries. DBIter -// combines multiple entries for the same userkey found in the DB -// representation into a single entry while accounting for sequence -// numbers, deletion markers, overwrites, etc. -class DBIter final: public Iterator { - public: - // The following is grossly complicated. TODO: clean it up - // Which direction is the iterator currently moving? - // (1) When moving forward: - // (1a) if current_entry_is_merged_ = false, the internal iterator is - // positioned at the exact entry that yields this->key(), this->value() - // (1b) if current_entry_is_merged_ = true, the internal iterator is - // positioned immediately after the last entry that contributed to the - // current this->value(). That entry may or may not have key equal to - // this->key(). - // (2) When moving backwards, the internal iterator is positioned - // just before all entries whose user key == this->key(). - enum Direction { - kForward, - kReverse - }; - - // LocalStatistics contain Statistics counters that will be aggregated per - // each iterator instance and then will be sent to the global statistics when - // the iterator is destroyed. - // - // The purpose of this approach is to avoid perf regression happening - // when multiple threads bump the atomic counters from a DBIter::Next(). 
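The deleted comment above (the inline DBIter class definition is being lifted out of db_iter.cc, apparently into db_iter.h) explains why DBIter batches its statistics locally instead of bumping the shared atomic tickers on every Next(). A std-only sketch of that aggregation pattern, with purely illustrative names:

#include <atomic>
#include <cstdint>

struct GlobalStats {
  std::atomic<uint64_t> number_db_next{0};
  std::atomic<uint64_t> iter_bytes_read{0};
};

class LocalIterStats {
 public:
  explicit LocalIterStats(GlobalStats* global) : global_(global) {}
  ~LocalIterStats() { Flush(); }     // aggregate exactly once, at the end
  void RecordNext(uint64_t bytes) {  // cheap: plain integer increments
    ++next_count_;
    bytes_read_ += bytes;
  }
  void Flush() {
    global_->number_db_next.fetch_add(next_count_, std::memory_order_relaxed);
    global_->iter_bytes_read.fetch_add(bytes_read_, std::memory_order_relaxed);
    next_count_ = bytes_read_ = 0;
  }

 private:
  GlobalStats* global_;
  uint64_t next_count_ = 0;
  uint64_t bytes_read_ = 0;
};

int main() {
  GlobalStats g;
  {
    LocalIterStats stats(&g);
    stats.RecordNext(42);
    stats.RecordNext(8);
  }  // destructor flushes the local counters into the shared atomics once
  return g.number_db_next.load() == 2 ? 0 : 1;
}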
- struct LocalStatistics { - explicit LocalStatistics() { ResetCounters(); } - - void ResetCounters() { - next_count_ = 0; - next_found_count_ = 0; - prev_count_ = 0; - prev_found_count_ = 0; - bytes_read_ = 0; - skip_count_ = 0; - } - - void BumpGlobalStatistics(Statistics* global_statistics) { - RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); - RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); - RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); - RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); - RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); - RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); - PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); - ResetCounters(); - } - - // Map to Tickers::NUMBER_DB_NEXT - uint64_t next_count_; - // Map to Tickers::NUMBER_DB_NEXT_FOUND - uint64_t next_found_count_; - // Map to Tickers::NUMBER_DB_PREV - uint64_t prev_count_; - // Map to Tickers::NUMBER_DB_PREV_FOUND - uint64_t prev_found_count_; - // Map to Tickers::ITER_BYTES_READ - uint64_t bytes_read_; - // Map to Tickers::NUMBER_ITER_SKIP - uint64_t skip_count_; - }; - - DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob) - : env_(_env), - logger_(cf_options.info_log), - user_comparator_(cmp), - merge_operator_(cf_options.merge_operator), - iter_(iter), - read_callback_(read_callback), - sequence_(s), - statistics_(cf_options.statistics), - num_internal_keys_skipped_(0), - iterate_lower_bound_(read_options.iterate_lower_bound), - iterate_upper_bound_(read_options.iterate_upper_bound), - direction_(kForward), - valid_(false), - current_entry_is_merged_(false), - prefix_same_as_start_(read_options.prefix_same_as_start), - pin_thru_lifetime_(read_options.pin_data), - total_order_seek_(read_options.total_order_seek), - allow_blob_(allow_blob), - is_blob_(false), - arena_mode_(arena_mode), - range_del_agg_(&cf_options.internal_comparator, s), - db_impl_(db_impl), - cfd_(cfd), - start_seqnum_(read_options.iter_start_seqnum) { - RecordTick(statistics_, NO_ITERATOR_CREATED); - prefix_extractor_ = mutable_cf_options.prefix_extractor.get(); - max_skip_ = max_sequential_skip_in_iterations; - max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; - if (pin_thru_lifetime_) { - pinned_iters_mgr_.StartPinning(); - } - if (iter_.iter()) { - iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); - } - } - ~DBIter() override { - // Release pinned data if any - if (pinned_iters_mgr_.PinningEnabled()) { - pinned_iters_mgr_.ReleasePinnedData(); - } - RecordTick(statistics_, NO_ITERATOR_DELETED); - ResetInternalKeysSkippedCounter(); - local_stats_.BumpGlobalStatistics(statistics_); - iter_.DeleteIter(arena_mode_); - } - virtual void SetIter(InternalIterator* iter) { - assert(iter_.iter() == nullptr); - iter_.Set(iter); +DBIter::DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* cmp, InternalIterator* iter, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) + : 
prefix_extractor_(mutable_cf_options.prefix_extractor.get()), + env_(_env), + logger_(cf_options.info_log), + user_comparator_(cmp), + merge_operator_(cf_options.merge_operator), + iter_(iter), + read_callback_(read_callback), + sequence_(s), + statistics_(cf_options.statistics), + num_internal_keys_skipped_(0), + iterate_lower_bound_(read_options.iterate_lower_bound), + iterate_upper_bound_(read_options.iterate_upper_bound), + direction_(kForward), + valid_(false), + current_entry_is_merged_(false), + is_key_seqnum_zero_(false), + prefix_same_as_start_(mutable_cf_options.prefix_extractor + ? read_options.prefix_same_as_start + : false), + pin_thru_lifetime_(read_options.pin_data), + total_order_seek_(read_options.total_order_seek), + allow_blob_(allow_blob), + is_blob_(false), + arena_mode_(arena_mode), + range_del_agg_(&cf_options.internal_comparator, s), + db_impl_(db_impl), + cfd_(cfd), + start_seqnum_(read_options.iter_start_seqnum) { + RecordTick(statistics_, NO_ITERATOR_CREATED); + max_skip_ = max_sequential_skip_in_iterations; + max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; + if (pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + if (iter_.iter()) { iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); } - virtual ReadRangeDelAggregator* GetRangeDelAggregator() { - return &range_del_agg_; - } - - bool Valid() const override { return valid_; } - Slice key() const override { - assert(valid_); - if(start_seqnum_ > 0) { - return saved_key_.GetInternalKey(); - } else { - return saved_key_.GetUserKey(); - } - } - Slice value() const override { - assert(valid_); - if (current_entry_is_merged_) { - // If pinned_value_ is set then the result of merge operator is one of - // the merge operands and we should return it. - return pinned_value_.data() ? pinned_value_ : saved_value_; - } else if (direction_ == kReverse) { - return pinned_value_; - } else { - return iter_.value(); - } - } - Status status() const override { - if (status_.ok()) { - return iter_.status(); - } else { - assert(!valid_); - return status_; - } - } - bool IsBlob() const { - assert(valid_ && (allow_blob_ || !is_blob_)); - return is_blob_; - } - - Status GetProperty(std::string prop_name, std::string* prop) override { - if (prop == nullptr) { - return Status::InvalidArgument("prop is nullptr"); - } - if (prop_name == "rocksdb.iterator.super-version-number") { - // First try to pass the value returned from inner iterator. - return iter_.iter()->GetProperty(prop_name, prop); - } else if (prop_name == "rocksdb.iterator.is-key-pinned") { - if (valid_) { - *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? 
"1" : "0"; - } else { - *prop = "Iterator is not valid."; - } - return Status::OK(); - } else if (prop_name == "rocksdb.iterator.internal-key") { - *prop = saved_key_.GetUserKey().ToString(); - return Status::OK(); - } - return Status::InvalidArgument("Unidentified property."); - } - - inline void Next() final override; - inline void Prev() final override; - inline void Seek(const Slice& target) final override; - inline void SeekForPrev(const Slice& target) final override; - inline void SeekToFirst() final override; - inline void SeekToLast() final override; - Env* env() { return env_; } - void set_sequence(uint64_t s) { - sequence_ = s; - if (read_callback_) { - read_callback_->Refresh(s); - } - } - void set_valid(bool v) { valid_ = v; } - - private: - // For all methods in this block: - // PRE: iter_->Valid() && status_.ok() - // Return false if there was an error, and status() is non-ok, valid_ = false; - // in this case callers would usually stop what they were doing and return. - bool ReverseToForward(); - bool ReverseToBackward(); - bool FindValueForCurrentKey(); - bool FindValueForCurrentKeyUsingSeek(); - bool FindUserKeyBeforeSavedKey(); - inline bool FindNextUserEntry(bool skipping, bool prefix_check); - inline bool FindNextUserEntryInternal(bool skipping, bool prefix_check); - bool ParseKey(ParsedInternalKey* key); - bool MergeValuesNewToOld(); - - void PrevInternal(); - bool TooManyInternalKeysSkipped(bool increment = true); - inline bool IsVisible(SequenceNumber sequence); - - // CanReseekToSkip() returns whether the iterator can use the optimization - // where it reseek by sequence number to get the next key when there are too - // many versions. This is disabled for write unprepared because seeking to - // sequence number does not guarantee that it is visible. - inline bool CanReseekToSkip(); - - // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() - // is called - void TempPinData() { - if (!pin_thru_lifetime_) { - pinned_iters_mgr_.StartPinning(); - } - } - - // Release blocks pinned by TempPinData() - void ReleaseTempPinnedData() { - if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) { - pinned_iters_mgr_.ReleasePinnedData(); - } - } +} - inline void ClearSavedValue() { - if (saved_value_.capacity() > 1048576) { - std::string empty; - swap(empty, saved_value_); - } else { - saved_value_.clear(); - } +Status DBIter::GetProperty(std::string prop_name, std::string* prop) { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); } - - inline void ResetInternalKeysSkippedCounter() { - local_stats_.skip_count_ += num_internal_keys_skipped_; + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. + return iter_.iter()->GetProperty(prop_name, prop); + } else if (prop_name == "rocksdb.iterator.is-key-pinned") { if (valid_) { - local_stats_.skip_count_--; + *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? 
"1" : "0"; + } else { + *prop = "Iterator is not valid."; } - num_internal_keys_skipped_ = 0; + return Status::OK(); + } else if (prop_name == "rocksdb.iterator.internal-key") { + *prop = saved_key_.GetUserKey().ToString(); + return Status::OK(); } + return Status::InvalidArgument("Unidentified property."); +} - const SliceTransform* prefix_extractor_; - Env* const env_; - Logger* logger_; - UserComparatorWrapper user_comparator_; - const MergeOperator* const merge_operator_; - IteratorWrapper iter_; - ReadCallback* read_callback_; - // Max visible sequence number. It is normally the snapshot seq unless we have - // uncommitted data in db as in WriteUnCommitted. - SequenceNumber sequence_; - - IterKey saved_key_; - // Reusable internal key data structure. This is only used inside one function - // and should not be used across functions. Reusing this object can reduce - // overhead of calling construction of the function if creating it each time. - ParsedInternalKey ikey_; - std::string saved_value_; - Slice pinned_value_; - // for prefix seek mode to support prev() - Statistics* statistics_; - uint64_t max_skip_; - uint64_t max_skippable_internal_keys_; - uint64_t num_internal_keys_skipped_; - const Slice* iterate_lower_bound_; - const Slice* iterate_upper_bound_; - - IterKey prefix_start_buf_; - - Status status_; - Slice prefix_start_key_; - Direction direction_; - bool valid_; - bool current_entry_is_merged_; - const bool prefix_same_as_start_; - // Means that we will pin all data blocks we read as long the Iterator - // is not deleted, will be true if ReadOptions::pin_data is true - const bool pin_thru_lifetime_; - const bool total_order_seek_; - bool allow_blob_; - bool is_blob_; - bool arena_mode_; - // List of operands for merge operator. - MergeContext merge_context_; - ReadRangeDelAggregator range_del_agg_; - LocalStatistics local_stats_; - PinnedIteratorsManager pinned_iters_mgr_; - DBImpl* db_impl_; - ColumnFamilyData* cfd_; - // for diff snapshots we want the lower bound on the seqnum; - // if this value > 0 iterator will return internal keys - SequenceNumber start_seqnum_; - - // No copying allowed - DBIter(const DBIter&); - void operator=(const DBIter&); -}; - -inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { +bool DBIter::ParseKey(ParsedInternalKey* ikey) { if (!ParseInternalKey(iter_.key(), ikey)) { status_ = Status::Corruption("corrupted internal key in DBIter"); valid_ = false; @@ -381,6 +137,7 @@ void DBIter::Next() { num_internal_keys_skipped_ = 0; bool ok = true; if (direction_ == kReverse) { + is_key_seqnum_zero_ = false; if (!ReverseToForward()) { ok = false; } @@ -397,9 +154,15 @@ void DBIter::Next() { local_stats_.next_count_++; if (ok && iter_.Valid()) { + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } FindNextUserEntry(true /* skipping the current user key */, - prefix_same_as_start_); + prefix_same_as_start_ ? 
&prefix : nullptr); } else { + is_key_seqnum_zero_ = false; valid_ = false; } if (statistics_ != nullptr && valid_) { @@ -408,7 +171,7 @@ void DBIter::Next() { } } -// PRE: saved_key_ has the current user key if skipping +// PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge // current_entry_is_merged_ => true @@ -418,17 +181,17 @@ void DBIter::Next() { // a delete marker or a sequence number higher than sequence_ // saved_key_ MUST have a proper user_key before calling this function // -// The prefix_check parameter controls whether we check the iterated -// keys against the prefix of the seeked key. Set to false when -// performing a seek without a key (e.g. SeekToFirst). Set to -// prefix_same_as_start_ for other iterations. -inline bool DBIter::FindNextUserEntry(bool skipping, bool prefix_check) { +// The prefix parameter, if not null, indicates that we need to iterator +// within the prefix, and the iterator needs to be made invalid, if no +// more entry for the prefix can be found. +bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) { PERF_TIMER_GUARD(find_next_user_entry_time); - return FindNextUserEntryInternal(skipping, prefix_check); + return FindNextUserEntryInternal(skipping_saved_key, prefix); } // Actual implementation of DBIter::FindNextUserEntry() -inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { +bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, + const Slice* prefix) { // Loop until we hit an acceptable entry to yield assert(iter_.Valid()); assert(status_.ok()); @@ -437,31 +200,46 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // How many times in a row we have skipped an entry with user key less than // or equal to saved_key_. We could skip these entries either because - // sequence numbers were too high or because skipping = true. + // sequence numbers were too high or because skipping_saved_key = true. // What saved_key_ contains throughout this method: - // - if skipping : saved_key_ contains the key that we need to skip, + // - if skipping_saved_key : saved_key_ contains the key that we need + // to skip, // and we haven't seen any keys greater than that, // - if num_skipped > 0 : saved_key_ contains the key that we have skipped // num_skipped times, and we haven't seen any keys // greater than that, // - none of the above : saved_key_ can contain anything, it doesn't matter. uint64_t num_skipped = 0; + // For write unprepared, the target sequence number in reseek could be larger + // than the snapshot, and thus needs to be skipped again. This could result in + // an infinite loop of reseeks. To avoid that, we limit the number of reseeks + // to one. + bool reseek_done = false; is_blob_ = false; do { + // Will update is_key_seqnum_zero_ as soon as we parsed the current key + // but we need to save the previous value to be used in the loop. 
+ bool is_prev_key_seqnum_zero = is_key_seqnum_zero_; if (!ParseKey(&ikey_)) { + is_key_seqnum_zero_ = false; return false; } - if (iterate_upper_bound_ != nullptr && + is_key_seqnum_zero_ = (ikey_.sequence == 0); + + assert(iterate_upper_bound_ == nullptr || iter_.MayBeOutOfUpperBound() || + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) < 0); + if (iterate_upper_bound_ != nullptr && iter_.MayBeOutOfUpperBound() && user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } - if (prefix_extractor_ && prefix_check && - prefix_extractor_->Transform(ikey_.user_key) - .compare(prefix_start_key_) != 0) { + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && + prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + assert(prefix_same_as_start_); break; } @@ -470,12 +248,21 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) } if (IsVisible(ikey_.sequence)) { - if (skipping && user_comparator_.Compare(ikey_.user_key, - saved_key_.GetUserKey()) <= 0) { + // If the previous entry is of seqnum 0, the current entry will not + // possibly be skipped. This condition can potentially be relaxed to + // prev_key.seq <= ikey_.sequence. We are cautious because it will be more + // prone to bugs causing the same user key with the same sequence number. + if (!is_prev_key_seqnum_zero && skipping_saved_key && + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()) <= + 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { + assert(!skipping_saved_key || + user_comparator_.Compare(ikey_.user_key, + saved_key_.GetUserKey()) > 0); num_skipped = 0; + reseek_done = false; switch (ikey_.type) { case kTypeDeletion: case kTypeSingleDeletion: @@ -494,7 +281,7 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); - skipping = true; + skipping_saved_key = true; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } break; @@ -512,12 +299,12 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) return true; } else { // this key and all previous versions shouldn't be included, - // skipping + // skipping_saved_key saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); - skipping = true; + skipping_saved_key = true; } } else { saved_key_.SetUserKey( @@ -527,8 +314,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) ikey_, RangeDelPositioningMode::kForwardTraversal)) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. - skipping = true; + skipping_saved_key = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else if (ikey_.type == kTypeBlobIndex) { if (!allow_blob_) { @@ -557,8 +345,9 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) ikey_, RangeDelPositioningMode::kForwardTraversal)) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. 
- skipping = true; + skipping_saved_key = true; num_skipped = 0; + reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else { // By now, we are sure the current ikey is going to yield a @@ -581,30 +370,40 @@ inline bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) // to seek to the target sequence number. int cmp = user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()); - if (cmp == 0 || (skipping && cmp <= 0)) { + if (cmp == 0 || (skipping_saved_key && cmp <= 0)) { num_skipped++; } else { saved_key_.SetUserKey( ikey_.user_key, !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - skipping = false; + skipping_saved_key = false; num_skipped = 0; + reseek_done = false; } } // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. - if (num_skipped > max_skip_ && CanReseekToSkip()) { + // + // To avoid infinite loops, do not reseek if we have already attempted to + // reseek previously. + // + // TODO(lth): If we reseek to sequence number greater than ikey_.sequence, + // than it does not make sense to reseek as we would actually land further + // away from the desired key. There is opportunity for optimization here. + if (num_skipped > max_skip_ && !reseek_done) { + is_key_seqnum_zero_ = false; num_skipped = 0; + reseek_done = true; std::string last_key; - if (skipping) { + if (skipping_saved_key) { // We're looking for the next user-key but all we see are the same // user-key with decreasing sequence numbers. Fast forward to // sequence number 0 and type deletion (the smallest type). AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); - // Don't set skipping = false because we may still see more user-keys - // equal to saved_key_. + // Don't set skipping_saved_key = false because we may still see more + // user-keys equal to saved_key_. } else { // We saw multiple entries with this user key and sequence numbers // higher than sequence_. Fast forward to sequence_. @@ -745,8 +544,14 @@ void DBIter::Prev() { } } if (ok) { - PrevInternal(); + Slice prefix; + if (prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix = prefix_.GetUserKey(); + } + PrevInternal(prefix_same_as_start_ ? &prefix : nullptr); } + if (statistics_ != nullptr) { local_stats_.prev_count_++; if (valid_) { @@ -824,21 +629,26 @@ bool DBIter::ReverseToBackward() { return FindUserKeyBeforeSavedKey(); } -void DBIter::PrevInternal() { +void DBIter::PrevInternal(const Slice* prefix) { while (iter_.Valid()) { saved_key_.SetUserKey( ExtractUserKey(iter_.key()), !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - if (prefix_extractor_ && prefix_same_as_start_ && + assert(prefix == nullptr || prefix_extractor_ != nullptr); + if (prefix != nullptr && prefix_extractor_->Transform(saved_key_.GetUserKey()) - .compare(prefix_start_key_) != 0) { + .compare(*prefix) != 0) { + assert(prefix_same_as_start_); // Current key does not have the same prefix as start valid_ = false; return; } - if (iterate_lower_bound_ != nullptr && + assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) >= 0); + if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < 0) { // We've iterated earlier than the user-specified lower bound. 
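The FindNextUserEntryInternal changes above keep the existing optimization of reseeking once more than max_skip_ obsolete versions of one user key have been skipped, but now guard it with reseek_done so that a write-unprepared reader cannot loop on repeated reseeks. A std-only toy model of the skip-versus-reseek idea (not RocksDB code; the container and names are illustrative, kMaxSkip stands in for max_sequential_skip_in_iterations):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Internal keys ordered like RocksDB's: user key ascending, sequence descending.
struct InternalKeyCmp {
  bool operator()(const std::pair<std::string, uint64_t>& a,
                  const std::pair<std::string, uint64_t>& b) const {
    if (a.first != b.first) return a.first < b.first;
    return a.second > b.second;  // newer versions sort first
  }
};

int main() {
  std::map<std::pair<std::string, uint64_t>, std::string, InternalKeyCmp> mem;
  for (uint64_t s = 1; s <= 1000; ++s) {
    mem[{"hot_key", s}] = "v" + std::to_string(s);  // many versions of one key
  }
  mem[{"next_key", 1}] = "x";

  const uint64_t kMaxSkip = 8;
  uint64_t skipped = 0;
  auto it = mem.begin();  // newest "hot_key" version
  ++it;                   // pretend the newest version was already consumed
  while (it != mem.end() && it->first.first == "hot_key") {
    if (++skipped > kMaxSkip) {
      // Too many obsolete versions: reseek to (same user key, seq 0), the
      // last possible slot for this key, much like the patch builds a target
      // of (saved_key_, 0, kTypeDeletion) and calls Seek on the inner iterator.
      it = mem.lower_bound({"hot_key", 0});
      break;
    }
    ++it;  // the slow path: linear skipping, one version at a time
  }
  if (it != mem.end() && it->first.first == "hot_key") ++it;
  std::cout << "landed on user key: " << it->first.first << "\n";  // next_key
  return 0;
}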
@@ -911,7 +721,7 @@ bool DBIter::FindValueForCurrentKey() { // This user key has lots of entries. // We're going from old to new, and it's taking too long. Let's do a Seek() // and go from new to old. This helps when a key was overwritten many times. - if (num_skipped >= max_skip_ && CanReseekToSkip()) { + if (num_skipped >= max_skip_) { return FindValueForCurrentKeyUsingSeek(); } @@ -1046,6 +856,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { // In case read_callback presents, the value we seek to may not be visible. // Find the next value that's visible. ParsedInternalKey ikey; + is_blob_ = false; while (true) { if (!iter_.Valid()) { valid_ = false; @@ -1087,6 +898,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_.iter()->IsValuePinned()); pinned_value_ = iter_.value(); + is_blob_ = (ikey.type == kTypeBlobIndex); valid_ = true; return true; } @@ -1208,7 +1020,7 @@ bool DBIter::FindUserKeyBeforeSavedKey() { PERF_COUNTER_ADD(internal_key_skipped_count, 1); } - if (num_skipped >= max_skip_ && CanReseekToSkip()) { + if (num_skipped >= max_skip_) { num_skipped = 0; IterKey last_key; last_key.SetInternalKey(ParsedInternalKey( @@ -1255,20 +1067,39 @@ bool DBIter::IsVisible(SequenceNumber sequence) { } } -bool DBIter::CanReseekToSkip() { - return read_callback_ == nullptr || read_callback_->CanReseekToSkip(); +void DBIter::SetSavedKeyToSeekTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + SequenceNumber seq = sequence_; + saved_key_.Clear(); + saved_key_.SetInternalKey(target, seq); + + if (iterate_lower_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < + 0) { + // Seek key is smaller than the lower bound. + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq); + } +} + +void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { + is_key_seqnum_zero_ = false; + saved_key_.Clear(); + // now saved_key is used to store internal key. + saved_key_.SetInternalKey(target, 0 /* sequence_number */, + kValueTypeForSeekForPrev); + + if (iterate_upper_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_upper_bound_) >= 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + } } void DBIter::Seek(const Slice& target) { PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); StopWatch sw(env_, statistics_, DB_SEEK); - status_ = Status::OK(); - ReleaseTempPinnedData(); - ResetInternalKeysSkippedCounter(); - - SequenceNumber seq = sequence_; - saved_key_.Clear(); - saved_key_.SetInternalKey(target, seq); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { @@ -1276,101 +1107,112 @@ void DBIter::Seek(const Slice& target) { } #endif // ROCKSDB_LITE - if (iterate_lower_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < - 0) { - saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_lower_bound_, seq); - } + status_ = Status::OK(); + ReleaseTempPinnedData(); + ResetInternalKeysSkippedCounter(); + // Seek the inner iterator based on the target key. 
{ PERF_TIMER_GUARD(seek_internal_seek_time); + + SetSavedKeyToSeekTarget(target); iter_.Seek(saved_key_.GetInternalKey()); + range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); } - RecordTick(statistics_, NUMBER_DB_SEEK); - if (iter_.Valid()) { - if (prefix_extractor_ && prefix_same_as_start_) { - prefix_start_key_ = prefix_extractor_->Transform(target); - } - direction_ = kForward; - ClearSavedValue(); - FindNextUserEntry(false /* not skipping */, prefix_same_as_start_); - if (!valid_) { - prefix_start_key_.clear(); - } - if (statistics_ != nullptr) { - if (valid_) { - // Decrement since we don't want to count this key as skipped - RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); - } + if (!iter_.Valid()) { + valid_ = false; + return; + } + direction_ = kForward; + + // Now the inner iterator is placed to the target position. From there, + // we need to find out the next key that is visible to the user. + // + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix; + target_prefix = prefix_extractor_->Transform(target); + FindNextUserEntry(false /* not skipping saved_key */, + &target_prefix /* prefix */); + if (valid_) { + // Remember the prefix of the seek key for the future Prev() call to + // check. + prefix_.SetUserKey(target_prefix); } } else { - valid_ = false; + FindNextUserEntry(false /* not skipping saved_key */, nullptr); + } + if (!valid_) { + return; } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey(prefix_start_key_); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + // Updating stats and perf context counters. + if (statistics_ != nullptr) { + // Decrement since we don't want to count this key as skipped + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); } + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } void DBIter::SeekForPrev(const Slice& target) { PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); StopWatch sw(env_, statistics_, DB_SEEK); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); - saved_key_.Clear(); - // now saved_key is used to store internal key. - saved_key_.SetInternalKey(target, 0 /* sequence_number */, - kValueTypeForSeekForPrev); - - if (iterate_upper_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_upper_bound_) >= 0) { - saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); - } + // Seek the inner iterator based on the target key. 
{ PERF_TIMER_GUARD(seek_internal_seek_time); + SetSavedKeyToSeekForPrevTarget(target); iter_.SeekForPrev(saved_key_.GetInternalKey()); range_del_agg_.InvalidateRangeDelMapPositions(); + RecordTick(statistics_, NUMBER_DB_SEEK); } - -#ifndef ROCKSDB_LITE - if (db_impl_ != nullptr && cfd_ != nullptr) { - db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + if (!iter_.Valid()) { + valid_ = false; + return; } -#endif // ROCKSDB_LITE + direction_ = kReverse; - RecordTick(statistics_, NUMBER_DB_SEEK); - if (iter_.Valid()) { - if (prefix_extractor_ && prefix_same_as_start_) { - prefix_start_key_ = prefix_extractor_->Transform(target); - } - direction_ = kReverse; - ClearSavedValue(); - PrevInternal(); - if (!valid_) { - prefix_start_key_.clear(); - } - if (statistics_ != nullptr) { - if (valid_) { - RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); - } + // Now the inner iterator is placed to the target position. From there, + // we need to find out the first key that is visible to the user in the + // backward direction. + ClearSavedValue(); + if (prefix_same_as_start_) { + // The case where the iterator needs to be invalidated if it has exausted + // keys within the same prefix of the seek key. + assert(prefix_extractor_ != nullptr); + Slice target_prefix; + target_prefix = prefix_extractor_->Transform(target); + PrevInternal(&target_prefix); + if (valid_) { + // Remember the prefix of the seek key for the future Prev() call to + // check. + prefix_.SetUserKey(target_prefix); } } else { - valid_ = false; + PrevInternal(nullptr); } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey(prefix_start_key_); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + + // Report stats and perf context. 
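With the reworked Seek and SeekForPrev above, prefix_same_as_start is honored only when a prefix extractor is configured, and the iterator remembers the seek key's prefix in prefix_ so it can invalidate itself once iteration leaves that prefix. A hedged usage sketch of that behavior through the public API; the database path, key layout and prefix length are illustrative:

#include <cassert>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // 6-byte prefixes so that "userA:" and "userB:" are distinct prefixes.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(6));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/prefix_demo", &db);
  assert(s.ok());
  db->Put(rocksdb::WriteOptions(), "userA:1", "a1");
  db->Put(rocksdb::WriteOptions(), "userA:2", "a2");
  db->Put(rocksdb::WriteOptions(), "userB:1", "b1");

  rocksdb::ReadOptions ropts;
  ropts.prefix_same_as_start = true;  // remember the seek key's prefix
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ropts));
  int count = 0;
  for (it->Seek("userA:"); it->Valid(); it->Next()) {
    ++count;  // sees userA:1 and userA:2, then becomes !Valid() at userB:1
  }
  assert(count == 2);
  it.reset();
  delete db;
  return 0;
}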
+ if (statistics_ != nullptr && valid_) { + RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); + RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } @@ -1390,6 +1232,7 @@ void DBIter::SeekToFirst() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); @@ -1402,7 +1245,8 @@ void DBIter::SeekToFirst() { saved_key_.SetUserKey( ExtractUserKey(iter_.key()), !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - FindNextUserEntry(false /* not skipping */, false /* no prefix check */); + FindNextUserEntry(false /* not skipping saved_key */, + nullptr /* no prefix check */); if (statistics_ != nullptr) { if (valid_) { RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); @@ -1413,10 +1257,9 @@ void DBIter::SeekToFirst() { } else { valid_ = false; } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey( - prefix_extractor_->Transform(saved_key_.GetUserKey())); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); } } @@ -1426,7 +1269,7 @@ void DBIter::SeekToLast() { SeekForPrev(*iterate_upper_bound_); if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { ReleaseTempPinnedData(); - PrevInternal(); + PrevInternal(nullptr); } return; } @@ -1442,13 +1285,14 @@ void DBIter::SeekToLast() { ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); + is_key_seqnum_zero_ = false; { PERF_TIMER_GUARD(seek_internal_seek_time); iter_.SeekToLast(); range_del_agg_.InvalidateRangeDelMapPositions(); } - PrevInternal(); + PrevInternal(nullptr); if (statistics_ != nullptr) { RecordTick(statistics_, NUMBER_DB_SEEK); if (valid_) { @@ -1457,10 +1301,9 @@ void DBIter::SeekToLast() { PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); } } - if (valid_ && prefix_extractor_ && prefix_same_as_start_) { - prefix_start_buf_.SetUserKey( - prefix_extractor_->Transform(saved_key_.GetUserKey())); - prefix_start_key_ = prefix_start_buf_.GetUserKey(); + if (valid_ && prefix_same_as_start_) { + assert(prefix_extractor_ != nullptr); + prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); } } @@ -1480,114 +1323,4 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, return db_iter; } -ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } - -ReadRangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() { - return db_iter_->GetRangeDelAggregator(); -} - -void ArenaWrappedDBIter::SetIterUnderDBIter(InternalIterator* iter) { - static_cast(db_iter_)->SetIter(iter); -} - -inline bool ArenaWrappedDBIter::Valid() const { return db_iter_->Valid(); } -inline void ArenaWrappedDBIter::SeekToFirst() { db_iter_->SeekToFirst(); } -inline void ArenaWrappedDBIter::SeekToLast() { db_iter_->SeekToLast(); } -inline void ArenaWrappedDBIter::Seek(const Slice& target) { - db_iter_->Seek(target); -} -inline void ArenaWrappedDBIter::SeekForPrev(const Slice& target) { - db_iter_->SeekForPrev(target); -} -inline void ArenaWrappedDBIter::Next() { db_iter_->Next(); } -inline void ArenaWrappedDBIter::Prev() { db_iter_->Prev(); } -inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } -inline Slice ArenaWrappedDBIter::value() const { return 
db_iter_->value(); } -inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); } -bool ArenaWrappedDBIter::IsBlob() const { return db_iter_->IsBlob(); } -inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name, - std::string* prop) { - if (prop_name == "rocksdb.iterator.super-version-number") { - // First try to pass the value returned from inner iterator. - if (!db_iter_->GetProperty(prop_name, prop).ok()) { - *prop = ToString(sv_number_); - } - return Status::OK(); - } - return db_iter_->GetProperty(prop_name, prop); -} - -void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob, - bool allow_refresh) { - auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, - true, max_sequential_skip_in_iteration, - read_callback, db_impl, cfd, allow_blob); - sv_number_ = version_number; - allow_refresh_ = allow_refresh; -} - -Status ArenaWrappedDBIter::Refresh() { - if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { - return Status::NotSupported("Creating renew iterator is not allowed."); - } - assert(db_iter_ != nullptr); - // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the - // correct behavior. Will be corrected automatically when we take a snapshot - // here for the case of WritePreparedTxnDB. - SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); - uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); - if (sv_number_ != cur_sv_number) { - Env* env = db_iter_->env(); - db_iter_->~DBIter(); - arena_.~Arena(); - new (&arena_) Arena(); - - SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); - if (read_callback_) { - read_callback_->Refresh(latest_seq); - } - Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, - latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, - allow_refresh_); - - InternalIterator* internal_iter = db_impl_->NewInternalIterator( - read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), - latest_seq); - SetIterUnderDBIter(internal_iter); - } else { - db_iter_->set_sequence(latest_seq); - db_iter_->set_valid(false); - } - return Status::OK(); -} - -ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh) { - ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, - max_sequential_skip_in_iterations, version_number, read_callback, - db_impl, cfd, allow_blob, allow_refresh); - if (db_impl != nullptr && cfd != nullptr && allow_refresh) { - iter->StoreRefreshInfo(read_options, db_impl, cfd, read_callback, - allow_blob); - } - - return iter; -} - } // namespace rocksdb diff --git a/db/db_iter.h b/db/db_iter.h index a640f0296e5..e6e072c5051 100644 --- a/db/db_iter.h 
+++ b/db/db_iter.h @@ -10,22 +10,321 @@ #pragma once #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" +#include "memory/arena.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" -#include "util/arena.h" +#include "table/iterator_wrapper.h" #include "util/autovector.h" namespace rocksdb { -class Arena; -class DBIter; +// This file declares the factory functions of DBIter, in its original form +// or a wrapped form with class ArenaWrappedDBIter, which is defined here. +// Class DBIter, which is declared and implemented inside db_iter.cc, is +// an iterator that converts internal keys (yielded by an InternalIterator) +// that were live at the specified sequence number into appropriate user +// keys. +// Each internal key consists of a user key, a sequence number, and a value +// type. DBIter deals with multiple key versions, tombstones, merge operands, +// etc., and exposes an Iterator. +// For example, DBIter may wrap the following InternalIterator: +// user key: AAA value: v3 seqno: 100 type: Put +// user key: AAA value: v2 seqno: 97 type: Put +// user key: AAA value: v1 seqno: 95 type: Put +// user key: BBB value: v1 seqno: 90 type: Put +// user key: BBC value: N/A seqno: 98 type: Delete +// user key: BBC value: v1 seqno: 95 type: Put +// If the snapshot passed in is 102, then the DBIter is expected to +// expose the following iterator: +// key: AAA value: v3 +// key: BBB value: v1 +// If the snapshot passed in is 96, then it should expose: +// key: AAA value: v1 +// key: BBB value: v1 +// key: BBC value: v1 +// + +// Memtables and sstables that make up the DB representation contain +// (userkey,seq,type) => uservalue entries. DBIter +// combines multiple entries for the same userkey found in the DB +// representation into a single entry while accounting for sequence +// numbers, deletion markers, overwrites, etc. +class DBIter final : public Iterator { + public: + // The following is grossly complicated. TODO: clean it up + // Which direction is the iterator currently moving? + // (1) When moving forward: + // (1a) if current_entry_is_merged_ = false, the internal iterator is + // positioned at the exact entry that yields this->key(), this->value() + // (1b) if current_entry_is_merged_ = true, the internal iterator is + // positioned immediately after the last entry that contributed to the + // current this->value(). That entry may or may not have key equal to + // this->key(). + // (2) When moving backwards, the internal iterator is positioned + // just before all entries whose user key == this->key(). + enum Direction { kForward, kReverse }; + + // LocalStatistics contains Statistics counters that will be aggregated per + // iterator instance and then will be sent to the global statistics when + // the iterator is destroyed. + // + // The purpose of this approach is to avoid the perf regression that happens + // when multiple threads bump the atomic counters from DBIter::Next(). 
+ struct LocalStatistics { + explicit LocalStatistics() { ResetCounters(); } + + void ResetCounters() { + next_count_ = 0; + next_found_count_ = 0; + prev_count_ = 0; + prev_found_count_ = 0; + bytes_read_ = 0; + skip_count_ = 0; + } + + void BumpGlobalStatistics(Statistics* global_statistics) { + RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); + RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); + RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); + RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); + RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); + PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); + ResetCounters(); + } + + // Map to Tickers::NUMBER_DB_NEXT + uint64_t next_count_; + // Map to Tickers::NUMBER_DB_NEXT_FOUND + uint64_t next_found_count_; + // Map to Tickers::NUMBER_DB_PREV + uint64_t prev_count_; + // Map to Tickers::NUMBER_DB_PREV_FOUND + uint64_t prev_found_count_; + // Map to Tickers::ITER_BYTES_READ + uint64_t bytes_read_; + // Map to Tickers::NUMBER_ITER_SKIP + uint64_t skip_count_; + }; + + DBIter(Env* _env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const Comparator* cmp, + InternalIterator* iter, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob); + + // No copying allowed + DBIter(const DBIter&) = delete; + void operator=(const DBIter&) = delete; + + ~DBIter() override { + // Release pinned data if any + if (pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + RecordTick(statistics_, NO_ITERATOR_DELETED); + ResetInternalKeysSkippedCounter(); + local_stats_.BumpGlobalStatistics(statistics_); + iter_.DeleteIter(arena_mode_); + } + virtual void SetIter(InternalIterator* iter) { + assert(iter_.iter() == nullptr); + iter_.Set(iter); + iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_); + } + virtual ReadRangeDelAggregator* GetRangeDelAggregator() { + return &range_del_agg_; + } + + bool Valid() const override { return valid_; } + Slice key() const override { + assert(valid_); + if (start_seqnum_ > 0) { + return saved_key_.GetInternalKey(); + } else { + return saved_key_.GetUserKey(); + } + } + Slice value() const override { + assert(valid_); + if (current_entry_is_merged_) { + // If pinned_value_ is set then the result of merge operator is one of + // the merge operands and we should return it. + return pinned_value_.data() ? 
pinned_value_ : saved_value_; + } else if (direction_ == kReverse) { + return pinned_value_; + } else { + return iter_.value(); + } + } + Status status() const override { + if (status_.ok()) { + return iter_.status(); + } else { + assert(!valid_); + return status_; + } + } + bool IsBlob() const { + assert(valid_ && (allow_blob_ || !is_blob_)); + return is_blob_; + } + + Status GetProperty(std::string prop_name, std::string* prop) override; + + void Next() final override; + void Prev() final override; + void Seek(const Slice& target) final override; + void SeekForPrev(const Slice& target) final override; + void SeekToFirst() final override; + void SeekToLast() final override; + Env* env() { return env_; } + void set_sequence(uint64_t s) { + sequence_ = s; + if (read_callback_) { + read_callback_->Refresh(s); + } + } + void set_valid(bool v) { valid_ = v; } + + private: + // For all methods in this block: + // PRE: iter_->Valid() && status_.ok() + // Return false if there was an error; in that case status() is non-ok and + // valid_ is false. Callers would usually stop what they were doing and return. + bool ReverseToForward(); + bool ReverseToBackward(); + // Set saved_key_ to the seek target, with the proper sequence number set. + // It might get adjusted if the seek key is smaller than iterator lower bound. + void SetSavedKeyToSeekTarget(const Slice& /*target*/); + // Set saved_key_ to the seek target, with the proper sequence number set. + // It might get adjusted if the seek key is larger than iterator upper bound. + void SetSavedKeyToSeekForPrevTarget(const Slice& /*target*/); + bool FindValueForCurrentKey(); + bool FindValueForCurrentKeyUsingSeek(); + bool FindUserKeyBeforeSavedKey(); + // If `skipping_saved_key` is true, the function will keep iterating until it + // finds a user key that is larger than `saved_key_`. + // If `prefix` is not null, the iterator needs to stop when all keys for the + // prefix are exhausted and the iterator is set to invalid. + bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); + // Internal implementation of FindNextUserEntry(). + bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); + bool ParseKey(ParsedInternalKey* key); + bool MergeValuesNewToOld(); + + // If prefix is not null, we need to set the iterator to invalid if no more + // entries can be found within the prefix. 
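Before the remaining private declarations (PrevInternal() and the other helpers) continue below, note that TooManyInternalKeysSkipped() and the max_skippable_internal_keys_ member declared further down back the ReadOptions::max_skippable_internal_keys knob. A hedged usage sketch follows; the limit value is arbitrary, and the assumption, consistent with these declarations, is that exceeding the limit leaves the iterator invalid with an Incomplete status.

    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Sketch: bound how many hidden internal entries (tombstones, superseded
    // versions) a single Seek()/Next() may skip before giving up.
    void BoundedSeek(rocksdb::DB* db, const rocksdb::Slice& target) {
      rocksdb::ReadOptions ro;
      ro.max_skippable_internal_keys = 10000;  // 0 (default) never gives up
      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
      it->Seek(target);
      if (!it->Valid() && it->status().IsIncomplete()) {
        // The skip budget was exhausted; retry later or fall back to another
        // access path instead of burning CPU on tombstones.
      }
    }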
+ void PrevInternal(const Slice* /*prefix*/); + bool TooManyInternalKeysSkipped(bool increment = true); + bool IsVisible(SequenceNumber sequence); + // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() + // is called + void TempPinData() { + if (!pin_thru_lifetime_) { + pinned_iters_mgr_.StartPinning(); + } + } + + // Release blocks pinned by TempPinData() + void ReleaseTempPinnedData() { + if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) { + pinned_iters_mgr_.ReleasePinnedData(); + } + } + + inline void ClearSavedValue() { + if (saved_value_.capacity() > 1048576) { + std::string empty; + swap(empty, saved_value_); + } else { + saved_value_.clear(); + } + } + + inline void ResetInternalKeysSkippedCounter() { + local_stats_.skip_count_ += num_internal_keys_skipped_; + if (valid_) { + local_stats_.skip_count_--; + } + num_internal_keys_skipped_ = 0; + } + + const SliceTransform* prefix_extractor_; + Env* const env_; + Logger* logger_; + UserComparatorWrapper user_comparator_; + const MergeOperator* const merge_operator_; + IteratorWrapper iter_; + ReadCallback* read_callback_; + // Max visible sequence number. It is normally the snapshot seq unless we have + // uncommitted data in db as in WriteUnCommitted. + SequenceNumber sequence_; + + IterKey saved_key_; + // Reusable internal key data structure. This is only used inside one function + // and should not be used across functions. Reusing this object can reduce + // overhead of calling construction of the function if creating it each time. + ParsedInternalKey ikey_; + std::string saved_value_; + Slice pinned_value_; + // for prefix seek mode to support prev() + Statistics* statistics_; + uint64_t max_skip_; + uint64_t max_skippable_internal_keys_; + uint64_t num_internal_keys_skipped_; + const Slice* iterate_lower_bound_; + const Slice* iterate_upper_bound_; + + // The prefix of the seek key. It is only used when prefix_same_as_start_ + // is true and prefix extractor is not null. In Next() or Prev(), current keys + // will be checked against this prefix, so that the iterator can be + // invalidated if the keys in this prefix has been exhausted. Set it using + // SetUserKey() and use it using GetUserKey(). + IterKey prefix_; + + Status status_; + Direction direction_; + bool valid_; + bool current_entry_is_merged_; + // True if we know that the current entry's seqnum is 0. + // This information is used as that the next entry will be for another + // user key. + bool is_key_seqnum_zero_; + const bool prefix_same_as_start_; + // Means that we will pin all data blocks we read as long the Iterator + // is not deleted, will be true if ReadOptions::pin_data is true + const bool pin_thru_lifetime_; + const bool total_order_seek_; + bool allow_blob_; + bool is_blob_; + bool arena_mode_; + // List of operands for merge operator. + MergeContext merge_context_; + ReadRangeDelAggregator range_del_agg_; + LocalStatistics local_stats_; + PinnedIteratorsManager pinned_iters_mgr_; +#ifdef ROCKSDB_LITE + ROCKSDB_FIELD_UNUSED +#endif + DBImpl* db_impl_; +#ifdef ROCKSDB_LITE + ROCKSDB_FIELD_UNUSED +#endif + ColumnFamilyData* cfd_; + // for diff snapshots we want the lower bound on the seqnum; + // if this value > 0 iterator will return internal keys + SequenceNumber start_seqnum_; +}; // Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number +// "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. 
extern Iterator* NewDBIterator( Env* env, const ReadOptions& read_options, @@ -36,77 +335,4 @@ extern Iterator* NewDBIterator( ReadCallback* read_callback, DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, bool allow_blob = false); -// A wrapper iterator which wraps DB Iterator and the arena, with which the DB -// iterator is supposed be allocated. This class is used as an entry point of -// a iterator hierarchy whose memory can be allocated inline. In that way, -// accessing the iterator tree can be more cache friendly. It is also faster -// to allocate. -class ArenaWrappedDBIter : public Iterator { - public: - virtual ~ArenaWrappedDBIter(); - - // Get the arena to be used to allocate memory for DBIter to be wrapped, - // as well as child iterators in it. - virtual Arena* GetArena() { return &arena_; } - virtual ReadRangeDelAggregator* GetRangeDelAggregator(); - - // Set the internal iterator wrapped inside the DB Iterator. Usually it is - // a merging iterator. - virtual void SetIterUnderDBIter(InternalIterator* iter); - virtual bool Valid() const override; - virtual void SeekToFirst() override; - virtual void SeekToLast() override; - virtual void Seek(const Slice& target) override; - virtual void SeekForPrev(const Slice& target) override; - virtual void Next() override; - virtual void Prev() override; - virtual Slice key() const override; - virtual Slice value() const override; - virtual Status status() const override; - virtual Status Refresh() override; - bool IsBlob() const; - - virtual Status GetProperty(std::string prop_name, std::string* prop) override; - - void Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh); - - void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, - ColumnFamilyData* cfd, ReadCallback* read_callback, - bool allow_blob) { - read_options_ = read_options; - db_impl_ = db_impl; - cfd_ = cfd; - read_callback_ = read_callback; - allow_blob_ = allow_blob; - } - - private: - DBIter* db_iter_; - Arena arena_; - uint64_t sv_number_; - ColumnFamilyData* cfd_ = nullptr; - DBImpl* db_impl_ = nullptr; - ReadOptions read_options_; - ReadCallback* read_callback_; - bool allow_blob_ = false; - bool allow_refresh_ = true; -}; - -// Generate the arena wrapped iterator class. -// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not -// be supported. 
-extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false, - bool allow_refresh = true); } // namespace rocksdb diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index a0f1dfeab45..b864ac4eae1 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -8,9 +8,9 @@ #include "rocksdb/comparator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/testharness.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" #ifdef GFLAGS diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 29fbd320861..1503886443b 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index ec5fc8006b8..6abe40b276b 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -9,13 +9,14 @@ #include +#include "db/arena_wrapped_db_iter.h" #include "db/db_iter.h" #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "table/flush_block_policy.h" +#include "table/block_based/flush_block_policy.h" namespace rocksdb { @@ -182,6 +183,33 @@ TEST_P(DBIteratorTest, IterSeekBeforePrev) { delete iter; } +TEST_P(DBIteratorTest, IterReseekNewUpperBound) { + Random rnd(301); + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + table_options.block_size_deviation = 50; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.compression = kNoCompression; + Reopen(options); + + ASSERT_OK(Put("a", RandomString(&rnd, 400))); + ASSERT_OK(Put("aabb", RandomString(&rnd, 400))); + ASSERT_OK(Put("aaef", RandomString(&rnd, 400))); + ASSERT_OK(Put("b", RandomString(&rnd, 400))); + dbfull()->Flush(FlushOptions()); + ReadOptions opts; + Slice ub = Slice("aa"); + opts.iterate_upper_bound = &ub; + auto iter = NewIterator(opts); + iter->Seek(Slice("a")); + ub = Slice("b"); + iter->Seek(Slice("aabc")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "aaef"); + delete iter; +} + TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); @@ -1049,6 +1077,149 @@ TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { ASSERT_EQ(upper_bound_hits, 1); } } + +// Enable kBinarySearchWithFirstKey, do some iterator operations and check that +// they don't do unnecessary block reads. 
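The comment above introduces kBinarySearchWithFirstKey, which the test that follows exercises by counting block reads. For reference, a minimal sketch of enabling this index type outside the test harness is shown here; it is not part of the patch, and the surrounding option values are illustrative.

    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    // Sketch: keep each data block's first key in the index so seeks and point
    // lookups can skip blocks that cannot contain the target key.
    rocksdb::Options MakeFirstKeyIndexOptions() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.index_type =
          rocksdb::BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }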
+TEST_P(DBIteratorTest, IndexWithFirstKey) { + for (int tailing = 0; tailing < 2; ++tailing) { + SCOPED_TRACE("tailing = " + std::to_string(tailing)); + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = + NewLRUCache(8000); // fits all blocks and their cache metadata overhead + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a1", "x1")); + ASSERT_OK(Merge("b1", "y1")); + ASSERT_OK(Merge("c0", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a2", "x2")); + ASSERT_OK(Merge("b2", "y2")); + ASSERT_OK(Merge("c0", "z2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("a3", "x3")); + ASSERT_OK(Merge("b3", "y3")); + ASSERT_OK(Merge("c3", "z3")); + ASSERT_OK(Flush()); + + // Block cache is not important for this test. + // We use BLOCK_CACHE_DATA_* counters just because they're the most readily + // available way of counting block accesses. + + ReadOptions ropt; + ropt.tailing = tailing; + std::unique_ptr iter(NewIterator(ropt)); + + iter->Seek("b10"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b3", iter->key().ToString()); + EXPECT_EQ("y3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Seek("c0"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c0", iter->key().ToString()); + EXPECT_EQ("z1,z2", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("c3", iter->key().ToString()); + EXPECT_EQ("z3", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter.reset(); + + // Enable iterate_upper_bound and check that iterator is not trying to read + // blocks that are fully above upper bound. 
+ std::string ub = "b3"; + Slice ub_slice(ub); + ropt.iterate_upper_bound = &ub_slice; + iter.reset(NewIterator(ropt)); + + iter->Seek("b2"); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ("b2", iter->key().ToString()); + EXPECT_EQ("y2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(5, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + } +} + +TEST_P(DBIteratorTest, IndexWithFirstKeyGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor = nullptr; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.statistics = rocksdb::CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + BlockBasedTableOptions table_options; + table_options.index_type = + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey; + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(); + table_options.block_cache = NewLRUCache(1000); // fits all blocks + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + ASSERT_OK(Merge("a", "x1")); + ASSERT_OK(Merge("c", "y1")); + ASSERT_OK(Merge("e", "z1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("c", "y2")); + ASSERT_OK(Merge("e", "z2")); + ASSERT_OK(Flush()); + + // Get() between blocks shouldn't read any blocks. + ASSERT_EQ("NOT_FOUND", Get("b")); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Get() of an existing key shouldn't read any unnecessary blocks when there's + // only one key per block. + + ASSERT_EQ("y1,y2", Get("c")); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + ASSERT_EQ("x1", Get("a")); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(std::vector({"NOT_FOUND", "z1,z2"}), + MultiGet({"b", "e"})); +} + // TODO(3.13): fix the issue of Seek() + Prev() which might not necessary // return the biggest key which is smaller than the seek key. 
TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { @@ -2450,6 +2621,192 @@ TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { ASSERT_EQ("a", it->key().ToString()); } +TEST_P(DBIteratorTest, AvoidReseekLevelIterator) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 800; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + std::string random_str = RandomString(&rnd, 180); + + ASSERT_OK(Put("1", random_str)); + ASSERT_OK(Put("2", random_str)); + ASSERT_OK(Put("3", random_str)); + ASSERT_OK(Put("4", random_str)); + // A new block + ASSERT_OK(Put("5", random_str)); + ASSERT_OK(Put("6", random_str)); + ASSERT_OK(Put("7", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("8", random_str)); + ASSERT_OK(Put("9", random_str)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + int num_find_file_in_level = 0; + int num_idx_blk_seek = 0; + SyncPoint::GetInstance()->SetCallBack( + "LevelIterator::Seek:BeforeFindFile", + [&](void* /*arg*/) { num_find_file_in_level++; }); + SyncPoint::GetInstance()->SetCallBack( + "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + { + std::unique_ptr iter(NewIterator(ReadOptions())); + iter->Seek("1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(1, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("6"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(2, num_idx_blk_seek); + + iter->Seek("7"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(1, num_find_file_in_level); + ASSERT_EQ(3, num_idx_blk_seek); + + iter->Seek("8"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(2, num_find_file_in_level); + // Still re-seek because "8" is the boundary key, which has + // the same user key as the seek key. + ASSERT_EQ(4, num_idx_blk_seek); + + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(5, num_idx_blk_seek); + + // Seek backward never triggers the index block seek to be skipped + iter->Seek("5"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(3, num_find_file_in_level); + ASSERT_EQ(6, num_idx_blk_seek); + } + + SyncPoint::GetInstance()->DisableProcessing(); +} + +// MyRocks may change iterate bounds before seek. Simply test to make sure such +// usage doesn't break iterator. 
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) { + Options options = CurrentOptions(); + options.compression = CompressionType::kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::string value(50, 'v'); + Reopen(options); + ASSERT_OK(Put("aaa", value)); + ASSERT_OK(Flush()); + ASSERT_OK(Put("bbb", "v")); + ASSERT_OK(Put("ccc", "v")); + ASSERT_OK(Put("ddd", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("eee", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::string ub1 = "e"; + std::string ub2 = "c"; + Slice ub(ub1); + ReadOptions read_opts1; + read_opts1.iterate_upper_bound = &ub; + Iterator* iter = NewIterator(read_opts1); + // Seek and iterate accross block boundary. + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + ub = Slice(ub2); + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; + + std::string lb1 = "a"; + std::string lb2 = "c"; + Slice lb(lb1); + ReadOptions read_opts2; + read_opts2.iterate_lower_bound = &lb; + iter = NewIterator(read_opts2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + lb = Slice(lb2); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + delete iter; +} + +TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) { + ASSERT_OK(Put("aaa", "v")); + ASSERT_OK(Put("bbb", "v")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("ccc", "v")); + ASSERT_OK(Put("ddd", "v")); + ASSERT_OK(Flush()); + // Move both files to bottom level. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + Slice lower_bound("b"); + ReadOptions read_opts; + read_opts.iterate_lower_bound = &lower_bound; + std::unique_ptr iter(NewIterator(read_opts)); + iter->SeekForPrev("d"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("ccc", iter->key()); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bbb", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 294d0f581bc..d9ad649e736 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -204,6 +204,76 @@ TEST_F(DBMemTableTest, DuplicateSeq) { delete mem; } +// A simple test to verify that the concurrent merge writes is functional +TEST_F(DBMemTableTest, ConcurrentMergeWrite) { + int num_ops = 1000; + std::string value; + Status s; + MergeContext merge_context; + Options options; + // A merge operator that is not sensitive to concurrent writes since in this + // test we don't order the writes. 
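The ConcurrentMergeWrite test being set up above drives MemTable::Add() from two threads directly. At the public API level the equivalent configuration is concurrent memtable writes combined with an order-insensitive (associative) merge operator; a hedged sketch is below. It is not part of the patch and assumes the default skip-list memtable, which is the representation that supports concurrent writes. The test setup resumes immediately after with exactly this uint64-add operator.

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "utilities/merge_operators.h"

    // Sketch: allow several writer threads to issue Merge() concurrently.
    rocksdb::Options MakeConcurrentMergeOptions() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.allow_concurrent_memtable_write = true;  // skip-list memtable only
      // A commutative/associative operator (uint64 add) produces the same
      // result regardless of the order concurrent merges reach the memtable.
      options.merge_operator = rocksdb::MergeOperators::CreateUInt64AddOperator();
      return options;
    }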
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared(); + options.memtable_factory = factory; + options.allow_concurrent_memtable_write = true; + ImmutableCFOptions ioptions(options); + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + + // Put 0 as the base + PutFixed64(&value, static_cast(0)); + bool res = mem->Add(0, kTypeValue, "key", value); + ASSERT_TRUE(res); + value.clear(); + + // Write Merge concurrently + rocksdb::port::Thread write_thread1([&]() { + MemTablePostProcessInfo post_process_info1; + std::string v1; + for (int seq = 1; seq < num_ops / 2; seq++) { + PutFixed64(&v1, seq); + bool res1 = + mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1); + ASSERT_TRUE(res1); + v1.clear(); + } + }); + rocksdb::port::Thread write_thread2([&]() { + MemTablePostProcessInfo post_process_info2; + std::string v2; + for (int seq = num_ops / 2; seq < num_ops; seq++) { + PutFixed64(&v2, seq); + bool res2 = + mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2); + ASSERT_TRUE(res2); + v2.clear(); + } + }); + write_thread1.join(); + write_thread2.join(); + + Status status; + ReadOptions roptions; + SequenceNumber max_covering_tombstone_seq = 0; + LookupKey lkey("key", kMaxSequenceNumber); + res = mem->Get(lkey, &value, &status, &merge_context, + &max_covering_tombstone_seq, roptions); + ASSERT_TRUE(res); + uint64_t ivalue = DecodeFixed64(Slice(value).data()); + uint64_t sum = 0; + for (int seq = 0; seq < num_ops; seq++) { + sum += seq; + } + ASSERT_EQ(ivalue, sum); + + delete mem; +} + TEST_F(DBMemTableTest, InsertWithHint) { Options options; options.allow_concurrent_memtable_write = false; @@ -252,7 +322,7 @@ TEST_F(DBMemTableTest, ColumnFamilyId) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - for (int cf = 0; cf < 2; ++cf) { + for (uint32_t cf = 0; cf < 2; ++cf) { ASSERT_OK(Put(cf, "key", "val")); ASSERT_OK(Flush(cf)); ASSERT_EQ( diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc new file mode 100644 index 00000000000..e6280ad8c79 --- /dev/null +++ b/db/db_merge_operand_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
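The new db_merge_operand_test.cc that begins above exercises DB::GetMergeOperands(), which hands back the individual merge operands for a key instead of the fully merged value. A condensed sketch of the call pattern used throughout the test follows; the fixed buffer size is an assumption of this sketch, and a real caller could instead retry with a larger buffer when the call reports Incomplete, as the test checks.

    #include <vector>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Sketch: fetch the raw merge operands for `key` rather than merging them.
    rocksdb::Status FetchOperands(rocksdb::DB* db, const rocksdb::Slice& key) {
      constexpr int kMaxOperands = 16;  // assumed upper bound for this sketch
      std::vector<rocksdb::PinnableSlice> operands(kMaxOperands);
      rocksdb::GetMergeOperandsOptions info;
      info.expected_max_number_of_operands = kMaxOperands;
      int count = 0;
      rocksdb::Status s = db->GetMergeOperands(
          rocksdb::ReadOptions(), db->DefaultColumnFamily(), key,
          operands.data(), &info, &count);
      if (s.IsIncomplete()) {
        // More than kMaxOperands operands exist; grow the buffer and call again.
        return s;
      }
      for (int i = 0; s.ok() && i < count; ++i) {
        // operands[i] holds one unmerged operand for `key`.
      }
      return s;
    }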
+ +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/utilities/debug.h" +#include "table/block_based/block_builder.h" +#include "test_util/fault_injection_test_env.h" +#if !defined(ROCKSDB_LITE) +#include "test_util/sync_point.h" +#endif +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/sortlist.h" +#include "utilities/merge_operators/string_append/stringappend2.h" + +namespace rocksdb { + +class DBMergeOperandTest : public DBTestBase { + public: + DBMergeOperandTest() : DBTestBase("/db_merge_operand_test") {} +}; + +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { + class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; + }; + + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // k0 value in memtable + Put("k0", "PutARock"); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "PutARock"); + + // k0.1 value in SST + Put("k0.1", "RockInSST"); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "RockInSST"); + + // All k1 values are in memtable. + ASSERT_OK(Merge("k1", "a")); + Put("k1", "x"); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1.1 values are in memtable. + ASSERT_OK(Merge("k1.1", "r")); + Delete("k1.1"); + ASSERT_OK(Merge("k1.1", "c")); + ASSERT_OK(Merge("k1.1", "k")); + ASSERT_OK(Merge("k1.1", "s")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "c"); + ASSERT_EQ(values[1], "k"); + ASSERT_EQ(values[2], "s"); + + // All k2 values are flushed to L0 into a single file. 
+ ASSERT_OK(Merge("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "q"); + ASSERT_EQ(values[1], "w"); + ASSERT_EQ(values[2], "e"); + ASSERT_EQ(values[3], "r"); + + // All k2.1 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.1", "m")); + Put("k2.1", "l"); + ASSERT_OK(Merge("k2.1", "n")); + ASSERT_OK(Merge("k2.1", "o")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "l,n,o"); + + // All k2.2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2.2", "g")); + Delete("k2.2"); + ASSERT_OK(Merge("k2.2", "o")); + ASSERT_OK(Merge("k2.2", "t")); + ASSERT_OK(Flush()); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "o,t"); + + // Do some compaction that will make the following tests more predictable + // Slice start("PutARock"); + // Slice end("t"); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // All k3 values are flushed and are in different files. + ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All k3.1 values are flushed and are in different files. + ASSERT_OK(Merge("k3.1", "ab")); + ASSERT_OK(Flush()); + Put("k3.1", "bc"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.1", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "bc"); + ASSERT_EQ(values[1], "cd"); + ASSERT_EQ(values[2], "de"); + + // All k3.2 values are flushed and are in different files. 
+ ASSERT_OK(Merge("k3.2", "ab")); + ASSERT_OK(Flush()); + Delete("k3.2"); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3.2", "de")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "cd"); + ASSERT_EQ(values[1], "de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); + + // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + ASSERT_OK(Merge("k5", "who")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Flush()); + Put("k5", "remember"); + ASSERT_OK(Merge("k5", "i")); + ASSERT_OK(Merge("k5", "am")); + ASSERT_OK(Merge("k5", "rocks")); + dbfull()->TEST_SwitchMemtable(); + db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", + values.data(), &merge_operands_info, + &number_of_operands); + ASSERT_EQ(values[0], "remember"); + ASSERT_EQ(values[1], "i"); + ASSERT_EQ(values[2], "am"); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 2b5e4a445ea..8358ddb56c2 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -46,9 +46,11 @@ class DBMergeOperatorTest : public DBTestBase { ReadOptions read_opt; read_opt.snapshot = snapshot; PinnableSlice value; - Status s = - dbfull()->GetImpl(read_opt, db_->DefaultColumnFamily(), key, &value, - nullptr /*value_found*/, &read_callback); + DBImpl::GetImplOptions get_impl_options; + get_impl_options.column_family = db_->DefaultColumnFamily(); + get_impl_options.value = &value; + get_impl_options.callback = &read_callback; + Status s = dbfull()->GetImpl(read_opt, key, get_impl_options); if (!s.ok()) { return s.ToString(); } @@ -275,68 +277,6 @@ TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { VerifyDBFromMap(true_data); } -TEST_P(MergeOperatorPinningTest, Randomized) { - do { - Options options = CurrentOptions(); - options.merge_operator = MergeOperators::CreateMaxOperator(); - BlockBasedTableOptions table_options; - table_options.no_block_cache = disable_block_cache_; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - - Random rnd(301); - std::map true_data; - - const int kTotalMerges = 5000; - // Every key gets ~10 operands - const int kKeyRange = kTotalMerges / 10; - const int kOperandSize = 20; - const int kNumPutBefore = kKeyRange / 10; // 10% value - const int kNumPutAfter = kKeyRange / 10; // 10% overwrite - const int kNumDelete = kKeyRange / 10; // 10% delete - - // kNumPutBefore keys will have base values - for (int i = 0; i < kNumPutBefore; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - 
} - - // Do kTotalMerges merges - for (int i = 0; i < kTotalMerges; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Merge(WriteOptions(), key, value)); - - if (true_data[key] < value) { - true_data[key] = value; - } - } - - // Overwrite random kNumPutAfter keys - for (int i = 0; i < kNumPutAfter; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - std::string value = RandomString(&rnd, kOperandSize); - ASSERT_OK(db_->Put(WriteOptions(), key, value)); - - true_data[key] = value; - } - - // Delete random kNumDelete keys - for (int i = 0; i < kNumDelete; i++) { - std::string key = Key(rnd.Next() % kKeyRange); - ASSERT_OK(db_->Delete(WriteOptions(), key)); - - true_data.erase(key); - } - - VerifyDBFromMap(true_data); - - } while (ChangeOptions(kSkipMergePut)); -} - class MergeOperatorHook : public MergeOperator { public: explicit MergeOperatorHook(std::shared_ptr _merge_op) @@ -637,6 +577,86 @@ TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) { db_->ReleaseSnapshot(snapshot2); } +class PerConfigMergeOperatorPinningTest + : public DBMergeOperatorTest, + public testing::WithParamInterface> { + public: + PerConfigMergeOperatorPinningTest() { + std::tie(disable_block_cache_, option_config_) = GetParam(); + } + + bool disable_block_cache_; +}; + +INSTANTIATE_TEST_CASE_P( + MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest, + ::testing::Combine(::testing::Bool(), + ::testing::Range(static_cast(DBTestBase::kDefault), + static_cast(DBTestBase::kEnd)))); + +TEST_P(PerConfigMergeOperatorPinningTest, Randomized) { + if (ShouldSkipOptions(option_config_, kSkipMergePut)) { + return; + } + + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateMaxOperator(); + BlockBasedTableOptions table_options; + table_options.no_block_cache = disable_block_cache_; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Random rnd(301); + std::map true_data; + + const int kTotalMerges = 5000; + // Every key gets ~10 operands + const int kKeyRange = kTotalMerges / 10; + const int kOperandSize = 20; + const int kNumPutBefore = kKeyRange / 10; // 10% value + const int kNumPutAfter = kKeyRange / 10; // 10% overwrite + const int kNumDelete = kKeyRange / 10; // 10% delete + + // kNumPutBefore keys will have base values + for (int i = 0; i < kNumPutBefore; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Do kTotalMerges merges + for (int i = 0; i < kTotalMerges; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + + if (true_data[key] < value) { + true_data[key] = value; + } + } + + // Overwrite random kNumPutAfter keys + for (int i = 0; i < kNumPutAfter; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + std::string value = RandomString(&rnd, kOperandSize); + ASSERT_OK(db_->Put(WriteOptions(), key, value)); + + true_data[key] = value; + } + + // Delete random kNumDelete keys + for (int i = 0; i < kNumDelete; i++) { + std::string key = Key(rnd.Next() % kKeyRange); + ASSERT_OK(db_->Delete(WriteOptions(), key)); + + true_data.erase(key); + } + + VerifyDBFromMap(true_data); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_options_test.cc 
b/db/db_options_test.cc index a7ecf12744b..cb031e62eb4 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -11,7 +11,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" @@ -19,14 +19,12 @@ #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/stats_history.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testutil.h" namespace rocksdb { -const int kMicrosInSec = 1000000; - class DBOptionsTest : public DBTestBase { public: DBOptionsTest() : DBTestBase("/db_options_test") {} @@ -66,10 +64,10 @@ class DBOptionsTest : public DBTestBase { std::unordered_map GetRandomizedMutableCFOptionsMap( Random* rnd) { - Options options; + Options options = CurrentOptions(); options.env = env_; ImmutableDBOptions db_options(options); - test::RandomInitCFOptions(&options, rnd); + test::RandomInitCFOptions(&options, options, rnd); auto sanitized_options = SanitizeOptions(db_options, options); auto opt_map = GetMutableCFOptionsMap(sanitized_options); delete options.compaction_filter; @@ -507,10 +505,10 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { options.stats_dump_period_sec = 5; options.env = env_; Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); for (int i = 0; i < 20; i++) { - int num = rand() % 5000 + 1; + unsigned int num = rand() % 5000 + 1; ASSERT_OK( dbfull()->SetDBOptions({{"stats_dump_period_sec", ToString(num)}})); ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec); @@ -518,283 +516,18 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { Close(); } -TEST_F(DBOptionsTest, RunStatsDumpPeriodSec) { - Options options; - options.create_if_missing = true; - options.stats_dump_period_sec = 5; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - int counter = 0; - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DumpStats:1", [&](void* /*arg*/) { - counter++; - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); - int old_val = counter; - for (int i = 6; i < 20; ++i) { - dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); - } - ASSERT_EQ(counter, old_val); - Close(); -} - -// Test persistent stats background thread scheduling and cancelling -TEST_F(DBOptionsTest, StatsPersistScheduling) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - std::unique_ptr mock_env; - 
mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - - // Test cacel job through SetOptions - ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); - Close(); -} - -// Test enabling persistent stats for the first time -TEST_F(DBOptionsTest, PersistentStatsFreshInstall) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 0; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); -#endif // OS_MACOSX && !NDEBUG - int counter = 0; - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - ASSERT_GE(counter, 1); - Close(); -} - TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { Options options; options.create_if_missing = true; options.stats_persist_period_sec = 5; options.env = env_; Reopen(options); - ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}})); - ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}})); - ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); -} - -TEST_F(DBOptionsTest, GetStatsHistory) { - Options options; - options.create_if_missing = true; - options.stats_persist_period_sec = 5; - options.statistics = rocksdb::CreateDBStatistics(); - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = 
mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - - int mock_time = 1; - // Wait for stats persist to finish - dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 6 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - // disabled stats snapshots - ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); - size_t stats_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - ASSERT_GT(stats_count, 0); - // Wait a bit and verify no more stats are found - for (mock_time = 6; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_new = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - stats_count_new += stats_iter->GetStatsMap().size(); - } - ASSERT_EQ(stats_count_new, stats_count); - Close(); -} - -TEST_F(DBOptionsTest, InMemoryStatsHistoryPurging) { - Options options; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.stats_persist_period_sec = 1; - std::unique_ptr mock_env; - mock_env.reset(new rocksdb::MockTimeEnv(env_)); - mock_env->set_current_time(0); // in seconds - options.env = mock_env.get(); -#if defined(OS_MACOSX) && !defined(NDEBUG) - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { - uint64_t time_us = *reinterpret_cast(arg); - if (time_us < mock_env->RealNowMicros()) { - *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; - } - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif // OS_MACOSX && !NDEBUG - - CreateColumnFamilies({"pikachu"}, options); - ASSERT_OK(Put("foo", "bar")); - ReopenWithColumnFamilies({"default", "pikachu"}, options); - // some random operation to populate statistics - ASSERT_OK(Delete("foo")); - ASSERT_OK(Put("sol", "sol")); - ASSERT_OK(Put("epic", "epic")); - ASSERT_OK(Put("ltd", "ltd")); - ASSERT_EQ("sol", Get("sol")); - ASSERT_EQ("epic", Get("epic")); - ASSERT_EQ("ltd", Get("ltd")); - Iterator* iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - ASSERT_OK(Delete("sol")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - int mock_time = 1; - // Wait for stats persist to finish - for (; mock_time < 5; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - - // 
second round of ops - ASSERT_OK(Put("saigon", "saigon")); - ASSERT_OK(Put("noodle talk", "noodle talk")); - ASSERT_OK(Put("ping bistro", "ping bistro")); - iterator = db_->NewIterator(ReadOptions()); - for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { - ASSERT_TRUE(iterator->key() == iterator->value()); - } - delete iterator; - ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - for (; mock_time < 10; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - std::unique_ptr stats_iter; - db_->GetStatsHistory(0, 10 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count = 0; - int slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count += stats_map.size(); - } - size_t stats_history_size = dbfull()->TEST_EstiamteStatsHistorySize(); - ASSERT_GE(slice_count, 9); - ASSERT_GE(stats_history_size, 12000); - // capping memory cost at 12000 bytes since one slice is around 10000~12000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); - ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); - // Wait for stats persist to finish - for (; mock_time < 20; ++mock_time) { - dbfull()->TEST_WaitForPersistStatsRun( - [&] { mock_env->set_current_time(mock_time); }); - } - db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); - ASSERT_TRUE(stats_iter != nullptr); - size_t stats_count_reopen = 0; - slice_count = 0; - for (; stats_iter->Valid(); stats_iter->Next()) { - slice_count++; - auto stats_map = stats_iter->GetStatsMap(); - stats_count_reopen += stats_map.size(); - } - size_t stats_history_size_reopen = dbfull()->TEST_EstiamteStatsHistorySize(); - // only one slice can fit under the new stats_history_buffer_size - ASSERT_LT(slice_count, 2); - ASSERT_TRUE(stats_history_size_reopen < 12000 && - stats_history_size_reopen > 0); - ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); - Close(); + ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec); } static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { @@ -873,6 +606,76 @@ TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); } +TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + + options.ttl = 0; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 0; + options.periodic_compaction_seconds = 100; + Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 0; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 100; + options.periodic_compaction_seconds = 500; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds); +} + +TEST_F(DBOptionsTest, SanitizeTtlDefault) { + Options options; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.compaction_style = kCompactionStyleLevel; + options.ttl = 0; + 
Reopen(options); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); +} + +TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.ttl = 0; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 100; + Reopen(options); + ASSERT_EQ(100, dbfull()->GetOptions().ttl); + + options.ttl = 100 * 24 * 60 * 60; + Reopen(options); + ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + options.ttl = 200; + options.periodic_compaction_seconds = 300; + Reopen(options); + ASSERT_EQ(200, dbfull()->GetOptions().ttl); + + options.ttl = 500; + options.periodic_compaction_seconds = 300; + Reopen(options); + ASSERT_EQ(300, dbfull()->GetOptions().ttl); +} + TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { Options options; options.compaction_style = kCompactionStyleFIFO; @@ -1007,6 +810,53 @@ TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) { ASSERT_EQ(256, env_->compaction_readahead_size_); Close(); } + +TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 10 << 10; // 10KB + options.create_if_missing = true; + + ASSERT_OK(TryReopen(options)); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // In release 6.0, ttl was promoted from a secondary level option under + // compaction_options_fifo to a top level option under ColumnFamilyOptions. + // We still need to handle old SetOptions calls but should ignore + // ttl under compaction_options_fifo. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"}, + {"ttl", "60"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 60); + + // Put ttl as the first option inside compaction_options_fifo. That works as + // it doesn't overwrite any other option. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"}, + {"ttl", "191"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 191); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 1a988f5ea4c..57206f6edb2 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -210,12 +210,11 @@ void VerifySimilar(uint64_t a, uint64_t b, double bias) { } } -void VerifyTableProperties(const TableProperties& base_tp, - const TableProperties& new_tp, - double filter_size_bias = 0.1, - double index_size_bias = 0.1, - double data_size_bias = 0.1, - double num_data_blocks_bias = 0.05) { +void VerifyTableProperties( + const TableProperties& base_tp, const TableProperties& new_tp, + double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 
0.15 : 0.1, + double index_size_bias = 0.1, double data_size_bias = 0.1, + double num_data_blocks_bias = 0.05) { VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); @@ -266,7 +265,8 @@ void GetExpectedTableProperties( // discount 1 byte as value size is not encoded in value delta encoding (value_delta_encoding ? 1 : 0)); expected_tp->filter_size = - kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); + kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 + + /*average-ish overhead*/ CACHE_LINE_SIZE / 2); } } // anonymous namespace @@ -615,8 +615,9 @@ TEST_F(DBPropertiesTest, NumImmutableMemTable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = 4; options.write_buffer_size = 1000000; + options.max_write_buffer_size_to_maintain = + 5 * static_cast(options.write_buffer_size); CreateAndReopenWithCF({"pikachu"}, options); std::string big_value(1000000 * 2, 'x'); @@ -747,7 +748,7 @@ TEST_F(DBPropertiesTest, DISABLED_GetProperty) { options.max_background_flushes = 1; options.max_write_buffer_number = 10; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; + options.max_write_buffer_size_to_maintain = 0; options.write_buffer_size = 1000000; Reopen(options); @@ -997,7 +998,7 @@ TEST_F(DBPropertiesTest, EstimatePendingCompBytes) { options.max_background_flushes = 1; options.max_write_buffer_number = 10; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; + options.max_write_buffer_size_to_maintain = 0; options.write_buffer_size = 1000000; Reopen(options); @@ -1630,7 +1631,11 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { // Test with empty block cache. constexpr size_t kCapacity = 100; - auto block_cache = NewLRUCache(kCapacity, 0 /*num_shard_bits*/); + LRUCacheOptions co; + co.capacity = kCapacity; + co.num_shard_bits = 0; + co.metadata_charge_policy = kDontChargeCacheMetadata; + auto block_cache = NewLRUCache(co); table_options.block_cache = block_cache; table_options.no_block_cache = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index aa63286f60a..ec448b731e0 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -5,7 +5,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#include "util/testutil.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -491,6 +491,30 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { ASSERT_EQ(expected, actual); } +TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { + // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4) + // Flush. The `CompactionIterator` previously had a bug where we forgot to + // check for covering range tombstones when processing the (1) Put, causing + // it to reappear after the flush. 
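For contrast with the PutDeleteRangeMergeFlush test below (where the DeleteRange covers the base Put, so only the merge operand survives and the result is 1), here is a minimal sketch of the same uint64-add encoding without a tombstone, in which the operator sums the base value and the operand. The DB path, asserts, and helper name are illustrative assumptions; `PutFixed64`/`DecodeFixed64` are the in-tree helpers from `util/coding.h`.

```cpp
#include <cassert>
#include <string>
#include "rocksdb/db.h"
#include "util/coding.h"                  // PutFixed64 / DecodeFixed64 (in-tree helpers)
#include "utilities/merge_operators.h"    // MergeOperators::CreateUInt64AddOperator

void UInt64AddSketch(const std::string& path) {
  rocksdb::Options opts;
  opts.create_if_missing = true;
  opts.merge_operator = rocksdb::MergeOperators::CreateUInt64AddOperator();
  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(opts, path, &db).ok());

  std::string val;
  rocksdb::PutFixed64(&val, 1);                                   // base value: 1
  assert(db->Put(rocksdb::WriteOptions(), "key", val).ok());
  assert(db->Merge(rocksdb::WriteOptions(), "key", val).ok());    // add 1
  assert(db->Flush(rocksdb::FlushOptions()).ok());

  std::string out;
  assert(db->Get(rocksdb::ReadOptions(), "key", &out).ok());
  // Without a covering range tombstone, the base Put and the operand sum to 2.
  assert(rocksdb::DecodeFixed64(out.data()) == 2);
  delete db;
}
```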
+ Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + std::string val; + PutFixed64(&val, 1); + ASSERT_OK(db_->Put(WriteOptions(), "key", val)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 1); + ASSERT_EQ(expected, actual); +} + // NumTableFilesAtLevel() is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { @@ -1497,6 +1521,84 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { ASSERT_EQ(1, num_range_deletions); } +TEST_F(DBRangeDelTest, OverlappedTombstones) { + const int kNumPerFile = 4, kNumFiles = 2; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.max_compaction_bytes = 9 * 1024; + DestroyAndReopen(options); + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + std::vector values; + // Write 12K (4 values, each 3K) + for (int j = 0; j < kNumPerFile; j++) { + values.push_back(RandomString(&rnd, 3 << 10)); + ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(2); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1), + Key((kNumFiles)*kNumPerFile + 1))); + ASSERT_OK(db_->Flush(FlushOptions())); + + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + + // The tombstone range is not broken up into multiple SSTs which may incur a + // large compaction with L2. 
+ ASSERT_EQ(1, NumTableFilesAtLevel(1)); + std::vector> files; + dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); +} + +TEST_F(DBRangeDelTest, OverlappedKeys) { + const int kNumPerFile = 4, kNumFiles = 2; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.max_compaction_bytes = 9 * 1024; + DestroyAndReopen(options); + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + std::vector values; + // Write 12K (4 values, each 3K) + for (int j = 0; j < kNumPerFile; j++) { + values.push_back(RandomString(&rnd, 3 << 10)); + ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(2); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + + for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) { + ASSERT_OK(Put(Key(i), "0x123")); + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // The key range is broken up into three SSTs to avoid a future big compaction + // with the grandparent + dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); + + std::vector> files; + dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 9003ed6b1ac..37adee46722 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -8,10 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_manager.h" -#include "util/sst_file_manager_impl.h" namespace rocksdb { @@ -430,6 +430,7 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { env_->time_elapse_only_sleep_ = true; Options options = CurrentOptions(); options.disable_auto_compactions = true; + options.compression = kNoCompression; options.env = env_; int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec @@ -439,7 +440,7 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { ASSERT_OK(s); options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); auto sfm = static_cast(options.sst_file_manager.get()); - sfm->delete_scheduler()->SetMaxTrashDBRatio(2.1); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); ASSERT_OK(TryReopen(options)); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -469,6 +470,111 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +class DBWALTestWithParam + : public DBSSTTest, + public testing::WithParamInterface> { + public: + DBWALTestWithParam() { + wal_dir_ = std::get<0>(GetParam()); + wal_dir_same_as_dbname_ = std::get<1>(GetParam()); + } + + std::string wal_dir_; + bool wal_dir_same_as_dbname_; +}; + +TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { + class MyEnv : public EnvWrapper { + public: + MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {} + + Status DeleteFile(const std::string& fname) { + if (fname.find(".log.trash") != std::string::npos && fake_log_delete) { + return Status::OK(); + } + + return target()->DeleteFile(fname); + } + + void set_fake_log_delete(bool fake) { fake_log_delete = fake; } + + private: + bool fake_log_delete; + }; + + std::unique_ptr env(new MyEnv(Env::Default())); + Destroy(last_options_); + + env->set_fake_log_delete(true); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env.get(); + options.wal_dir = dbname_ + wal_dir_; + + int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec + Status s; + options.sst_file_manager.reset( + NewSstFileManager(env_, nullptr, "", 0, false, &s, 0)); + ASSERT_OK(s); + options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); + + ASSERT_OK(TryReopen(options)); + + // Create 4 files in L0 + for (char v = 'a'; v <= 'd'; v++) { + ASSERT_OK(Put("Key2", DummyString(1024, v))); + ASSERT_OK(Put("Key3", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Put("Key1", DummyString(1024, v))); + ASSERT_OK(Put("Key4", DummyString(1024, v))); + ASSERT_OK(Flush()); + } + // We created 4 sst files in L0 + ASSERT_EQ("4", FilesPerLevel(0)); + + Close(); + + options.sst_file_manager.reset(); + std::vector filenames; + int trash_log_count = 0; + if (!wal_dir_same_as_dbname_) { + // Forcibly create some trash log files + std::unique_ptr result; + env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions()); + result.reset(); + } + env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_GE(trash_log_count, 1); + + env->set_fake_log_delete(false); + ASSERT_OK(TryReopen(options)); + + filenames.clear(); + trash_log_count = 0; + 
env->GetChildren(options.wal_dir, &filenames); + for (const std::string& fname : filenames) { + if (fname.find(".log.trash") != std::string::npos) { + trash_log_count++; + } + } + ASSERT_EQ(trash_log_count, 0); + Close(); +} + +INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam, + ::testing::Values(std::make_tuple("", true), + std::make_tuple("_wal_dir", false))); + TEST_F(DBSSTTest, OpenDBWithExistingTrash) { Options options = CurrentOptions(); diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 5a54fd81c05..164042bc277 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -14,8 +14,8 @@ #include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #ifndef ROCKSDB_LITE @@ -139,12 +139,12 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { Options options; options.create_if_missing = true; options.write_buffer_size = 4096; - options.max_write_buffer_number = 3; + options.max_write_buffer_number = 2; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; + options.level0_stop_writes_trigger = 2; options.target_file_size_base = 2048; - options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_base = 40960; options.max_bytes_for_level_multiplier = 4; options.hard_pending_compaction_bytes_limit = 16 * 1024; options.num_levels = 8; @@ -230,7 +230,7 @@ TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) { // Create one table per CF, then verify it was created with the column family // name property. 
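The GetColumnFamilyNameProperty test below checks that each SST file records its column family name in its table properties. As a reference, a minimal sketch of reading that property through the public API; `db` and `handle` are assumed to be an already-open database and one of its column family handles.

```cpp
#include <cassert>
#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

void PrintCfNameProperty(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* handle) {
  // Maps each live SST file name to its TableProperties.
  rocksdb::TablePropertiesCollection props;
  assert(db->GetPropertiesOfAllTables(handle, &props).ok());
  for (const auto& file_and_props : props) {
    fprintf(stderr, "%s -> column_family_name=%s\n",
            file_and_props.first.c_str(),
            file_and_props.second->column_family_name.c_str());
  }
}
```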
- for (int cf = 0; cf < 2; ++cf) { + for (uint32_t cf = 0; cf < 2; ++cf) { Put(cf, "key", "val"); Flush(cf); diff --git a/db/db_test.cc b/db/db_test.cc index 8a112e48fcd..16d1a4deebe 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -24,13 +24,15 @@ #endif #include "cache/lru_cache.h" -#include "db/db_impl.h" +#include "db/blob_index.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/thread_status_util.h" #include "port/port.h" @@ -53,19 +55,17 @@ #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -883,7 +883,7 @@ TEST_F(DBTest, FlushMultipleMemtable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = -1; + options.max_write_buffer_size_to_maintain = -1; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); @@ -901,7 +901,8 @@ TEST_F(DBTest, FlushSchedule) { options.level0_stop_writes_trigger = 1 << 10; options.level0_slowdown_writes_trigger = 1 << 10; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; + options.max_write_buffer_size_to_maintain = + static_cast(options.write_buffer_size); options.max_write_buffer_number = 2; options.write_buffer_size = 120 * 1024; CreateAndReopenWithCF({"pikachu"}, options); @@ -1019,39 +1020,149 @@ TEST_F(DBTest, FailMoreDbPaths) { ASSERT_TRUE(TryReopen(options).IsNotSupported()); } -void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { +void CheckColumnFamilyMeta( + const ColumnFamilyMetaData& cf_meta, + const std::vector>& files_by_level, + uint64_t start_time, uint64_t end_time) { + ASSERT_EQ(cf_meta.name, kDefaultColumnFamilyName); + ASSERT_EQ(cf_meta.levels.size(), files_by_level.size()); + uint64_t cf_size = 0; - uint64_t cf_csize = 0; size_t file_count = 0; - for (auto level_meta : cf_meta.levels) { + + for (size_t i = 0; i < cf_meta.levels.size(); ++i) { + const auto& level_meta_from_cf = cf_meta.levels[i]; + const auto& level_meta_from_files = files_by_level[i]; + + ASSERT_EQ(level_meta_from_cf.level, i); + ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size()); + + file_count += level_meta_from_cf.files.size(); + uint64_t level_size = 0; - uint64_t level_csize = 0; - file_count += level_meta.files.size(); - for (auto file_meta : level_meta.files) { - level_size += file_meta.size; + for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) { + const auto& 
file_meta_from_cf = level_meta_from_cf.files[j]; + const auto& file_meta_from_files = level_meta_from_files[j]; + + level_size += file_meta_from_cf.size; + + ASSERT_EQ(file_meta_from_cf.file_number, + file_meta_from_files.fd.GetNumber()); + ASSERT_EQ(file_meta_from_cf.file_number, + TableFileNameToNumber(file_meta_from_cf.name)); + ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size); + ASSERT_EQ(file_meta_from_cf.smallest_seqno, + file_meta_from_files.fd.smallest_seqno); + ASSERT_EQ(file_meta_from_cf.largest_seqno, + file_meta_from_files.fd.largest_seqno); + ASSERT_EQ(file_meta_from_cf.smallestkey, + file_meta_from_files.smallest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.largestkey, + file_meta_from_files.largest.user_key().ToString()); + ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number, + file_meta_from_files.oldest_blob_file_number); + ASSERT_EQ(file_meta_from_cf.oldest_ancester_time, + file_meta_from_files.oldest_ancester_time); + ASSERT_EQ(file_meta_from_cf.file_creation_time, + file_meta_from_files.file_creation_time); + ASSERT_GE(file_meta_from_cf.file_creation_time, start_time); + ASSERT_LE(file_meta_from_cf.file_creation_time, end_time); + ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time); + ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time); } - ASSERT_EQ(level_meta.size, level_size); + + ASSERT_EQ(level_meta_from_cf.size, level_size); cf_size += level_size; - cf_csize += level_csize; } + ASSERT_EQ(cf_meta.file_count, file_count); ASSERT_EQ(cf_meta.size, cf_size); } +void CheckLiveFilesMeta( + const std::vector& live_file_meta, + const std::vector>& files_by_level) { + size_t total_file_count = 0; + for (const auto& f : files_by_level) { + total_file_count += f.size(); + } + + ASSERT_EQ(live_file_meta.size(), total_file_count); + + int level = 0; + int i = 0; + + for (const auto& meta : live_file_meta) { + if (level != meta.level) { + level = meta.level; + i = 0; + } + + ASSERT_LT(i, files_by_level[level].size()); + + const auto& expected_meta = files_by_level[level][i]; + + ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName); + ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber()); + ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name)); + ASSERT_EQ(meta.size, expected_meta.fd.file_size); + ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno); + ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno); + ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString()); + ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString()); + ASSERT_EQ(meta.oldest_blob_file_number, + expected_meta.oldest_blob_file_number); + + ++i; + } +} + #ifndef ROCKSDB_LITE -TEST_F(DBTest, ColumnFamilyMetaDataTest) { +TEST_F(DBTest, MetaDataTest) { Options options = CurrentOptions(); options.create_if_missing = true; + options.disable_auto_compactions = true; + + int64_t temp_time = 0; + options.env->GetCurrentTime(&temp_time); + uint64_t start_time = static_cast(temp_time); + DestroyAndReopen(options); Random rnd(301); int key_index = 0; - ColumnFamilyMetaData cf_meta; for (int i = 0; i < 100; ++i) { - GenerateNewFile(&rnd, &key_index); - db_->GetColumnFamilyMetaData(&cf_meta); - CheckColumnFamilyMeta(cf_meta); + // Add a single blob reference to each file + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000, + /* offset */ 1234, /* size */ 5678, kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, 
Key(key_index), + blob_index)); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + + ++key_index; + + // Fill up the rest of the file with random values. + GenerateNewFile(&rnd, &key_index, /* nowait */ true); + + Flush(); } + + std::vector> files_by_level; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level); + + options.env->GetCurrentTime(&temp_time); + uint64_t end_time = static_cast(temp_time); + + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta, files_by_level, start_time, end_time); + + std::vector live_file_meta; + db_->GetLiveFilesMetaData(&live_file_meta); + CheckLiveFilesMeta(live_file_meta, files_by_level); } namespace { @@ -1257,6 +1368,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) { options.compression = kNoCompression; options.create_if_missing = true; DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); const int N = 128; Random rnd(301); @@ -1268,9 +1380,10 @@ TEST_F(DBTest, ApproximateSizesMemTable) { std::string start = Key(50); std::string end = Key(60); Range r(start, end); - uint8_t include_both = DB::SizeApproximationFlags::INCLUDE_FILES | - DB::SizeApproximationFlags::INCLUDE_MEMTABLES; - db_->GetApproximateSizes(&r, 1, &size, include_both); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table @@ -1280,7 +1393,7 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { @@ -1290,19 +1403,20 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); options.max_write_buffer_number = 8; options.min_write_buffer_number_to_merge = 5; options.write_buffer_size = 1024 * N; // Not very large DestroyAndReopen(options); + default_cf = db_->DefaultColumnFamily(); int keys[N * 3]; for (int i = 0; i < N; i++) { @@ -1319,26 +1433,27 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, 
default_cf, &r, 1, + &size_with_mt); ASSERT_GT(size_with_mt, 6000); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_EQ(size_without_mt, 0); @@ -1352,10 +1467,80 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(&r, 1, &size_with_mt, include_both); + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt); db_->GetApproximateSizes(&r, 1, &size_without_mt); ASSERT_GT(size_with_mt, size_without_mt); ASSERT_GT(size_without_mt, 6000); + + // Check that include_memtabtles flag works as expected + size_approx_options.include_memtabtles = false; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_EQ(size, size_without_mt); + + // Check that files_size_error_margin works as expected, when the heuristic + // conditions are not met + start = Key(1); + end = Key(1000 + N - 2); + r = Range(start, end); + size_approx_options.files_size_error_margin = -1.0; // disabled + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + uint64_t size2; + size_approx_options.files_size_error_margin = 0.5; // enabled, but not used + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_EQ(size, size2); +} + +TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { + Options options = CurrentOptions(); + options.write_buffer_size = 1024 * 1024; + options.compression = kNoCompression; + options.create_if_missing = true; + options.target_file_size_base = 1024 * 1024; + DestroyAndReopen(options); + const auto default_cf = db_->DefaultColumnFamily(); + + const int N = 64000; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files + Flush(); + // Compact the entire key space into the next level + db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr); + + // Write more keys + for (int i = N; i < (N + N / 4); i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + // Flush everything to files again + Flush(); + + // Wait for compaction to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + const std::string start = Key(0); + const std::string end = Key(2 * N); + const Range r(start, end); + + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = false; + size_approx_options.include_files = true; + size_approx_options.files_size_error_margin = -1.0; // disabled + + // Get the precise size without any approximation heuristic + uint64_t size; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_NE(size, 0); + + // Get the size with an approximation heuristic + uint64_t size2; + const double error_margin = 0.2; + size_approx_options.files_size_error_margin = error_margin; + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_LT(size2, size * (1 + error_margin)); + ASSERT_GT(size2, size * (1 - error_margin)); } TEST_F(DBTest, GetApproximateMemTableStats) { @@ -2285,6 +2470,7 @@ class MultiThreadedDBTest }; TEST_P(MultiThreadedDBTest, MultiThreaded) { + if (option_config_ == kPipelinedWrite) return; anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; Options options = CurrentOptions(options_override); @@ -2465,6 +2651,15 @@ class ModelDB : public DB { return Status::NotSupported(key); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& /*options*/, 
ColumnFamilyHandle* /*column_family*/, + const Slice& key, PinnableSlice* /*slice*/, + GetMergeOperandsOptions* /*merge_operands_options*/, + int* /*number_of_operands*/) override { + return Status::NotSupported(key); + } + using DB::MultiGet; std::vector MultiGet( const ReadOptions& /*options*/, @@ -2491,7 +2686,18 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented"); } - Status VerifyChecksum() override { + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const ExportImportFilesMetaData& /*metadata*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not implemented."); + } + + using DB::VerifyChecksum; + Status VerifyChecksum(const ReadOptions&) override { return Status::NotSupported("Not implemented."); } @@ -2587,13 +2793,14 @@ class ModelDB : public DB { return false; } using DB::GetApproximateSizes; - void GetApproximateSizes(ColumnFamilyHandle* /*column_family*/, - const Range* /*range*/, int n, uint64_t* sizes, - uint8_t /*include_flags*/ - = INCLUDE_FILES) override { + Status GetApproximateSizes(const SizeApproximationOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Range* /*range*/, int n, + uint64_t* sizes) override { for (int i = 0; i < n; i++) { sizes[i] = 0; } + return Status::OK(); } using DB::GetApproximateMemTableStats; void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/, @@ -2641,6 +2848,10 @@ class ModelDB : public DB { return Status::NotSupported("Not supported operation."); } + void EnableManualCompaction() override { return; } + + void DisableManualCompaction() override { return; } + using DB::NumberLevels; int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; } @@ -2693,6 +2904,16 @@ class ModelDB : public DB { return Status::OK(); } + Status GetCurrentWalFile( + std::unique_ptr* /*current_log_file*/) override { + return Status::OK(); + } + + virtual Status GetCreationTimeOfOldestFile( + uint64_t* /*creation_time*/) override { + return Status::NotSupported(); + } + Status DeleteFile(std::string /*name*/) override { return Status::OK(); } Status GetUpdatesSince( @@ -3114,10 +3335,10 @@ TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) { options.create_if_missing = true; options.ttl = 600; // seconds - // Check that it is not supported with max_open_files != -1. + // TTL is now supported with max_open_files != -1. options.max_open_files = 100; options = CurrentOptions(options); - ASSERT_TRUE(TryReopen(options).IsNotSupported()); + ASSERT_OK(TryReopen(options)); options.max_open_files = -1; ASSERT_OK(TryReopen(options)); @@ -4624,6 +4845,7 @@ TEST_F(DBTest, DynamicCompactionOptions) { // Even more FIFOCompactionTests are at DBTest.FIFOCompaction* . 
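The FIFO-related tests around here (FIFOTtlBackwardCompatible above and DynamicFIFOCompactionOptions below) rely on changing FIFO compaction parameters at runtime through SetOptions(). A minimal sketch of that call pattern, assuming `db` was opened with `compaction_style = kCompactionStyleFIFO`; nested options travel as one serialized string, and since release 6.0 any `ttl` given inside `compaction_options_fifo` is ignored in favor of the top-level `ttl` option.

```cpp
#include <cassert>
#include "rocksdb/db.h"

void FifoSetOptionsSketch(rocksdb::DB* db) {
  assert(db->SetOptions(
               {{"compaction_options_fifo",
                 "{max_table_files_size=1024;allow_compaction=true;}"},
                {"ttl", "60"}})
             .ok());
  assert(db->GetOptions().compaction_options_fifo.max_table_files_size == 1024);
  assert(db->GetOptions().compaction_options_fifo.allow_compaction == true);
  assert(db->GetOptions().ttl == 60);  // top-level ttl wins
}
```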
TEST_F(DBTest, DynamicFIFOCompactionOptions) { Options options; + options.ttl = 0; options.create_if_missing = true; DestroyAndReopen(options); @@ -4689,15 +4911,15 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { DestroyAndReopen(options); // Initial defaults - ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, - 2); + 2u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, UINT_MAX); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.max_size_amplification_percent, - 200); + 200u); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.compression_size_percent, @@ -4710,15 +4932,15 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { ASSERT_OK(dbfull()->SetOptions( {{"compaction_options_universal", "{size_ratio=7;}"}})); - ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, - 2); + 2u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, UINT_MAX); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.max_size_amplification_percent, - 200); + 200u); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.compression_size_percent, @@ -4731,15 +4953,15 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { ASSERT_OK(dbfull()->SetOptions( {{"compaction_options_universal", "{min_merge_width=11;}"}})); - ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, - 11); + 11u); ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, UINT_MAX); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.max_size_amplification_percent, - 200); + 200u); ASSERT_EQ(dbfull() ->GetOptions() .compaction_options_universal.compression_size_percent, @@ -4884,11 +5106,15 @@ TEST_F(DBTest, DynamicMiscOptions) { ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], &mutable_cf_options)); ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression); - ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); - ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], - &mutable_cf_options)); - ASSERT_EQ(CompressionType::kSnappyCompression, - mutable_cf_options.compression); + + if (Snappy_Supported()) { + ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}})); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0], + &mutable_cf_options)); + ASSERT_EQ(CompressionType::kSnappyCompression, + mutable_cf_options.compression); + } + // Test paranoid_file_checks already done in db_block_cache_test ASSERT_OK( dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}})); @@ -5973,6 +6199,19 @@ TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { } } +TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) { + Options options = CurrentOptions(); + options.max_open_files = 100; + Reopen(options); + + ColumnFamilyOptions cf_options(options); + // ttl is now supported when max_open_files is -1. 
+ cf_options.ttl = 3600; + ColumnFamilyHandle* handle; + ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle)); + delete handle; +} + #ifndef ROCKSDB_LITE TEST_F(DBTest, RowCache) { Options options = CurrentOptions(); @@ -6138,10 +6377,140 @@ TEST_F(DBTest, ThreadLocalPtrDeadlock) { fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n", flushes_done.load(), threads_destroyed.load()); } + +TEST_F(DBTest, LargeBlockSizeTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put(0, "foo", "bar")); + BlockBasedTableOptions table_options; + table_options.block_size = 8LL * 1024 * 1024 * 1024LL; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); +} + +#ifndef ROCKSDB_LITE + +TEST_F(DBTest, CreationTimeOfOldestFile) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 100; + + Options options = CurrentOptions(); + options.max_open_files = -1; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + bool set_file_creation_time_to_zero = true; + int idx = 0; + + int64_t time_1 = 0; + env_->GetCurrentTime(&time_1); + const uint64_t uint_time_1 = static_cast(time_1); + + // Add 50 hours + env_->addon_time_.fetch_add(50 * 60 * 60); + + int64_t time_2 = 0; + env_->GetCurrentTime(&time_2); + const uint64_t uint_time_2 = static_cast(time_2); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) { + TableProperties* props = reinterpret_cast(arg); + if (set_file_creation_time_to_zero) { + if (idx == 0) { + props->file_creation_time = 0; + idx++; + } else if (idx == 1) { + props->file_creation_time = uint_time_1; + idx = 0; + } + } else { + if (idx == 0) { + props->file_creation_time = uint_time_1; + idx++; + } else if (idx == 1) { + props->file_creation_time = uint_time_2; + } + } + }); + // Set file creation time in manifest all to 0. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "FileMetaData::FileMetaData", [&](void* arg) { + FileMetaData* meta = static_cast(arg); + meta->file_creation_time = 0; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + + // At this point there should be 2 files, one with file_creation_time = 0 and + // the other non-zero. GetCreationTimeOfOldestFile API should return 0. + uint64_t creation_time; + Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time); + ASSERT_EQ(0, creation_time); + ASSERT_EQ(s1, Status::OK()); + + // Testing with non-zero file creation time. + set_file_creation_time_to_zero = false; + options = CurrentOptions(); + options.max_open_files = -1; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + + // At this point there should be 2 files with non-zero file creation time. + // GetCreationTimeOfOldestFile API should return non-zero value. 
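Before the remaining assertions of this test, a minimal usage sketch of the GetCreationTimeOfOldestFile() API it exercises. Per the test, the call only succeeds with `max_open_files = -1` and otherwise returns Status::NotSupported(); the reporting function below is an illustrative assumption.

```cpp
#include <cinttypes>
#include <cstdio>
#include "rocksdb/db.h"

void ReportOldestFileCreationTime(rocksdb::DB* db) {
  uint64_t creation_time = 0;
  rocksdb::Status s = db->GetCreationTimeOfOldestFile(&creation_time);
  if (s.IsNotSupported()) {
    fprintf(stderr, "requires max_open_files = -1\n");
  } else if (s.ok()) {
    // 0 means at least one live file has no recorded creation time.
    fprintf(stderr, "oldest file created at %" PRIu64 " (unix seconds)\n",
            creation_time);
  }
}
```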
+ uint64_t ctime; + Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime); + ASSERT_EQ(uint_time_1, ctime); + ASSERT_EQ(s2, Status::OK()); + + // Testing with max_open_files != -1 + options = CurrentOptions(); + options.max_open_files = 10; + DestroyAndReopen(options); + Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime); + ASSERT_EQ(s3, Status::NotSupported()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif + } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/db_test2.cc b/db/db_test2.cc index 75e7fe4abba..6b0ee157e4c 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -16,6 +16,7 @@ #include "port/stack_trace.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/wal_filter.h" +#include "test_util/fault_injection_test_env.h" namespace rocksdb { @@ -1036,8 +1037,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { ASSERT_TRUE(index == keys_cf.size()); } -// Temporarily disable it because the test is flaky. -TEST_F(DBTest2, DISABLED_PresetCompressionDict) { +TEST_F(DBTest2, PresetCompressionDict) { // Verifies that compression ratio improves when dictionary is enabled, and // improves even further when the dictionary is trained by ZSTD. const size_t kBlockSizeBytes = 4 << 10; @@ -1046,7 +1046,8 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { const int kNumL0Files = 5; Options options; - options.env = CurrentOptions().env; // Make sure to use any custom env that the test is configured with. + // Make sure to use any custom env that the test is configured with. + options.env = CurrentOptions().env; options.allow_concurrent_memtable_write = false; options.arena_block_size = kBlockSizeBytes; options.create_if_missing = true; @@ -1072,10 +1073,19 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { compression_types.push_back(kZSTD); } + enum DictionaryTypes : int { + kWithoutDict, + kWithDict, + kWithZSTDTrainedDict, + kDictEnd, + }; + for (auto compression_type : compression_types) { options.compression = compression_type; - size_t prev_out_bytes; - for (int i = 0; i < 3; ++i) { + size_t bytes_without_dict = 0; + size_t bytes_with_dict = 0; + size_t bytes_with_zstd_trained_dict = 0; + for (int i = kWithoutDict; i < kDictEnd; i++) { // First iteration: compress without preset dictionary // Second iteration: compress with preset dictionary // Third iteration (zstd only): compress with zstd-trained dictionary @@ -1085,19 +1095,19 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { // the non-first iterations, verify the data we get out is the same data // we put in. 
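The switch that follows cycles PresetCompressionDict through three dictionary modes. As a reference, a sketch of the same three configurations spelled out as plain Options setup; the mode numbering mirrors the test's kWithoutDict/kWithDict/kWithZSTDTrainedDict enum, and the byte budgets stand in for its kBlockSizeBytes/kL0FileBytes constants.

```cpp
#include <cstddef>
#include <cstdint>
#include "rocksdb/options.h"

rocksdb::Options DictOptions(int mode, size_t block_bytes, size_t l0_bytes) {
  rocksdb::Options o;
  o.compression = rocksdb::kZSTD;
  if (mode == 0) {        // kWithoutDict: no preset dictionary
    o.compression_opts.max_dict_bytes = 0;
    o.compression_opts.zstd_max_train_bytes = 0;
  } else if (mode == 1) { // kWithDict: raw (untrained) dictionary samples
    o.compression_opts.max_dict_bytes = static_cast<uint32_t>(block_bytes);
    o.compression_opts.zstd_max_train_bytes = 0;
  } else {                // kWithZSTDTrainedDict: ZSTD-trained dictionary
    o.compression_opts.max_dict_bytes = static_cast<uint32_t>(block_bytes);
    o.compression_opts.zstd_max_train_bytes = static_cast<uint32_t>(l0_bytes);
  }
  return o;
}
```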
switch (i) { - case 0: + case kWithoutDict: options.compression_opts.max_dict_bytes = 0; options.compression_opts.zstd_max_train_bytes = 0; break; - case 1: - options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes; + case kWithDict: + options.compression_opts.max_dict_bytes = kBlockSizeBytes; options.compression_opts.zstd_max_train_bytes = 0; break; - case 2: + case kWithZSTDTrainedDict: if (compression_type != kZSTD) { continue; } - options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes; + options.compression_opts.max_dict_bytes = kBlockSizeBytes; options.compression_opts.zstd_max_train_bytes = kL0FileBytes; break; default: @@ -1129,23 +1139,32 @@ TEST_F(DBTest2, DISABLED_PresetCompressionDict) { ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); - size_t out_bytes = 0; - std::vector files; - GetSstFiles(env_, dbname_, &files); - for (const auto& file : files) { - uint64_t curr_bytes; - env_->GetFileSize(dbname_ + "/" + file, &curr_bytes); - out_bytes += static_cast(curr_bytes); + // Get the live sst files size + size_t total_sst_bytes = TotalSize(1); + if (i == kWithoutDict) { + bytes_without_dict = total_sst_bytes; + } else if (i == kWithDict) { + bytes_with_dict = total_sst_bytes; + } else if (i == kWithZSTDTrainedDict) { + bytes_with_zstd_trained_dict = total_sst_bytes; } for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes); j++) { ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast(j)))); } - if (i) { - ASSERT_GT(prev_out_bytes, out_bytes); + if (i == kWithDict) { + ASSERT_GT(bytes_without_dict, bytes_with_dict); + } else if (i == kWithZSTDTrainedDict) { + // In zstd compression, it is sometimes possible that using a trained + // dictionary does not get as good a compression ratio as without + // training. + // But using a dictionary (with or without training) should always get + // better compression ratio than not using one. + ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict || + bytes_without_dict > bytes_with_zstd_trained_dict); } - prev_out_bytes = out_bytes; + DestroyAndReopen(options); } } @@ -1923,7 +1942,10 @@ TEST_F(DBTest2, TestPerfContextIterCpuTime) { } #endif // OS_LINUX -#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented +// GetUniqueIdFromFile is not implemented on these platforms. Persistent cache +// breaks when that function is not implemented and no regular block cache is +// provided. +#if !defined(OS_SOLARIS) && !defined(OS_WIN) TEST_F(DBTest2, PersistentCache) { int num_iter = 80; @@ -1987,7 +2009,7 @@ TEST_F(DBTest2, PersistentCache) { } } } -#endif // !OS_SOLARIS +#endif // !defined(OS_SOLARIS) && !defined(OS_WIN) namespace { void CountSyncPoint() { @@ -2384,6 +2406,215 @@ TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, PausingManualCompaction1) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + DestroyAndReopen(options); + Random rnd(301); + // Generate a file containing 10 keys. 
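The PausingManualCompaction tests that follow exercise the new DisableManualCompaction()/EnableManualCompaction() pair. A minimal sketch of the intended call pattern, under the assumption of an already-open `db`: a CompactRange() issued while manual compaction is disabled returns without compacting (as PausingManualCompaction3 asserts), and re-enabling lets a later CompactRange() run normally.

```cpp
#include "rocksdb/db.h"

void PauseAndResumeManualCompaction(rocksdb::DB* db) {
  db->DisableManualCompaction();
  // Effectively a no-op while manual compaction is disabled.
  db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);

  db->EnableManualCompaction();
  // Now the full-range manual compaction proceeds as usual.
  db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
}
```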
+ for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + + // Generate another file containing same keys + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + + int manual_compactions_paused = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) { + auto paused = reinterpret_cast*>(arg); + ASSERT_FALSE(paused->load(std::memory_order_acquire)); + paused->store(true, std::memory_order_release); + manual_compactions_paused += 1; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector files_before_compact, files_after_compact; + // Remember file name before compaction is triggered + std::vector files_meta; + dbfull()->GetLiveFilesMetaData(&files_meta); + for (auto file : files_meta) { + files_before_compact.push_back(file.name); + } + + // OK, now trigger a manual compaction + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Wait for compactions to get scheduled and stopped + dbfull()->TEST_WaitForCompact(true); + + // Get file names after compaction is stopped + files_meta.clear(); + dbfull()->GetLiveFilesMetaData(&files_meta); + for (auto file : files_meta) { + files_after_compact.push_back(file.name); + } + + // Like nothing happened + ASSERT_EQ(files_before_compact, files_after_compact); + ASSERT_EQ(manual_compactions_paused, 1); + + manual_compactions_paused = 0; + // Now make sure CompactFiles also not run + dbfull()->CompactFiles(rocksdb::CompactionOptions(), files_before_compact, 0); + // Wait for manual compaction to get scheduled and finish + dbfull()->TEST_WaitForCompact(true); + + files_meta.clear(); + files_after_compact.clear(); + dbfull()->GetLiveFilesMetaData(&files_meta); + for (auto file : files_meta) { + files_after_compact.push_back(file.name); + } + + ASSERT_EQ(files_before_compact, files_after_compact); + // CompactFiles returns at entry point + ASSERT_EQ(manual_compactions_paused, 0); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +// PausingManualCompaction does not affect auto compaction +TEST_F(DBTest2, PausingManualCompaction2) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = false; + + DestroyAndReopen(options); + dbfull()->DisableManualCompaction(); + + Random rnd(301); + for (int i = 0; i < 2; i++) { + // Generate a file containing 10 keys. 
+ for (int j = 0; j < 100; j++) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + std::vector files_meta; + dbfull()->GetLiveFilesMetaData(&files_meta); + ASSERT_EQ(files_meta.size(), 1); +} + +TEST_F(DBTest2, PausingManualCompaction3) { + CompactRangeOptions compact_options; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + int run_manual_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", + [&](void* /*arg*/) { run_manual_compactions++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + dbfull()->DisableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + // As manual compaction disabled, not even reach sync point + ASSERT_EQ(run_manual_compactions, 0); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + rocksdb::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + dbfull()->EnableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, PausingManualCompaction4) { + CompactRangeOptions compact_options; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), RandomString(&rnd, 50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + int run_manual_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) { + auto paused = reinterpret_cast*>(arg); + ASSERT_FALSE(paused->load(std::memory_order_acquire)); + paused->store(true, std::memory_order_release); + run_manual_compactions++; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + dbfull()->EnableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + ASSERT_EQ(run_manual_compactions, 1); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + rocksdb::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:2"); + dbfull()->EnableManualCompaction(); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + 
+  dbfull()->TEST_WaitForCompact(true);
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_F(DBTest2, OptimizeForPointLookup) {
   Options options = CurrentOptions();
   Close();
@@ -2779,8 +3010,12 @@ TEST_F(DBTest2, ReadCallbackTest) {
   ReadOptions roptions;
   TestReadCallback callback(seq);
   bool dont_care = true;
-  Status s = dbfull()->GetImpl(roptions, dbfull()->DefaultColumnFamily(), key,
-                               &pinnable_val, &dont_care, &callback);
+  DBImpl::GetImplOptions get_impl_options;
+  get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+  get_impl_options.value = &pinnable_val;
+  get_impl_options.value_found = &dont_care;
+  get_impl_options.callback = &callback;
+  Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
   ASSERT_TRUE(s.ok());
   // Assuming that after each Put the DB increased seq by one, the value and
   // seq number must be equal since we also inc value by 1 after each Put.
@@ -3738,10 +3973,256 @@ TEST_F(DBTest2, OldStatsInterface) {
   ASSERT_GT(dos->num_rt, 0);
   ASSERT_GT(dos->num_mt, 0);
 }
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+  const Snapshot* ss = db_->GetSnapshot();
+
+  for (auto h : handles_) {
+    db_->DestroyColumnFamilyHandle(h);
+  }
+  handles_.clear();
+
+  ASSERT_NOK(db_->Close());
+  db_->ReleaseSnapshot(ss);
+  ASSERT_OK(db_->Close());
+  delete db_;
+  db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(Put("bbb1", ""));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Seeking into f1, the iterator checks f1's bloom filter, which reports
+  // that the prefix is absent, so the file iterator is invalidated and the
+  // cursor moves on to f2, whose next key is "ddd0".
+  iter->Seek("bbb1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("bbb1", iter->key().ToString());
+
+  // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
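+  // If the level iterator wrongly reused the invalidated f1 state from the
+  // previous seek, it would return "ddd0" here instead of "ccc1".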
+  iter->Seek("ccc1");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("ccc1", iter->key().ToString());
+
+  delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Construct two L1 files with keys:
+  // f1:[aaa1 ccc1] f2:[ddd0]
+  ASSERT_OK(Put("aaa1", ""));
+  ASSERT_OK(Put("ccc1", ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("ddd0", ""));
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // The seek key is filtered out by f1's bloom filter.
+  // This is just one of several valid positions following the contract.
+  // Positioning to ccc1 or ddd0 is also valid. This is just to validate
+  // the behavior of the current implementation. If the underlying
+  // implementation changes, the test might fail here.
+  iter->Seek("bbb1");
+  ASSERT_FALSE(iter->Valid());
+
+  delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8 * 8192);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo", "bar1"));
+
+  const Snapshot* s1 = db_->GetSnapshot();
+
+  ASSERT_OK(Put("foo", "bar2"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("foo2", "bar"));
+  const Snapshot* s2 = db_->GetSnapshot();
+  ASSERT_OK(Put("foo3", "bar"));
+  const Snapshot* s3 = db_->GetSnapshot();
+
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s2), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s1), "bar1");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+  ASSERT_EQ(Get("foo", s3), "bar2");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+  db_->ReleaseSnapshot(s1);
+  db_->ReleaseSnapshot(s2);
+  db_->ReleaseSnapshot(s3);
+}
+#endif // ROCKSDB_LITE
+
+// When the DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and it is written again after
+// each subsequent flush. If the DB crashes between those flushes, the CF
+// that has already been flushed points past the latest log file; recovery
+// must then not require that log file to be uncorrupted, otherwise it raises
+// a spurious corruption report. The test below exercises this scenario.
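+// It simulates the crash with a FaultInjectionTestEnv that freezes the file
+// system at each of two sync points (before the final memtable flush during
+// log recovery, and before the last version edit is written to the MANIFEST),
+// then checks that a subsequent reopen with a normal Env succeeds.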
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+  const std::vector<std::string> sync_points = {
+      "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+  for (const auto& test_sync_point : sync_points) {
+    Options options = CurrentOptions();
+    // First destroy original db to ensure a clean start.
+    DestroyAndReopen(options);
+    options.create_if_missing = true;
+    options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Flush());
+    ASSERT_OK(Put(1, "foo", "bar"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_OK(Put(1, "foo", "bar"));
+    // The value is large enough to be split across two blocks.
+    std::string large_value(400, ' ');
+    ASSERT_OK(Put("foo1", large_value));
+    ASSERT_OK(Put("foo2", large_value));
+    Close();
+
+    // Corrupt the log file in the middle, so that it is not corrupted
+    // in the tail.
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    for (const auto& f : filenames) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) {
+        std::string fname = dbname_ + "/" + f;
+        std::string file_content;
+        ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+        file_content[400] = 'h';
+        file_content[401] = 'a';
+        ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+        break;
+      }
+    }
+
+    // Reopen and freeze the file system after the first manifest write.
+    FaultInjectionTestEnv fit_env(options.env);
+    options.env = &fit_env;
+    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+    rocksdb::SyncPoint::GetInstance()->SetCallBack(
+        test_sync_point,
+        [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_NOK(TryReopenWithColumnFamilies(
+        {kDefaultColumnFamilyName, "pikachu"}, options));
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+    fit_env.SetFilesystemActive(true);
+    // If we kept using the fault injection Env, it would complain when
+    // renaming the CURRENT file, which is not expected. This still needs to
+    // be investigated.
+ options.env = env_; + ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, + options)); + } +} + +TEST_F(DBTest2, SeekFileRangeDeleteTail) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(NewCappedPrefixTransform(1)); + options.num_levels = 3; + DestroyAndReopen(options); + + ASSERT_OK(Put("a", "a")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f")); + ASSERT_OK(Put("b", "a")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("x", "a")); + ASSERT_OK(Put("z", "a")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + { + ReadOptions ro; + ro.total_order_seek = true; + std::unique_ptr iter(db_->NewIterator(ro)); + iter->Seek("e"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("x", iter->key().ToString()); + } + db_->ReleaseSnapshot(s1); +} } // namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/db_test_util.cc b/db/db_test_util.cc index bee6b81d5dd..88f57275f31 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "db/forward_iterator.h" #include "rocksdb/env_encryption.h" +#include "rocksdb/utilities/object_registry.h" namespace rocksdb { @@ -47,20 +48,30 @@ ROT13BlockCipher rot13Cipher_(16); #endif // ROCKSDB_LITE DBTestBase::DBTestBase(const std::string path) - : mem_env_(!getenv("MEM_ENV") ? nullptr : new MockEnv(Env::Default())), + : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) { + Env* base_env = Env::Default(); #ifndef ROCKSDB_LITE - encrypted_env_( - !getenv("ENCRYPTED_ENV") - ? nullptr - : NewEncryptedEnv(mem_env_ ? mem_env_ : Env::Default(), - new CTREncryptionProvider(rot13Cipher_))), -#else - encrypted_env_(nullptr), -#endif // ROCKSDB_LITE - env_(new SpecialEnv(encrypted_env_ - ? encrypted_env_ - : (mem_env_ ? mem_env_ : Env::Default()))), - option_config_(kDefault) { + const char* test_env_uri = getenv("TEST_ENV_URI"); + if (test_env_uri) { + Status s = ObjectRegistry::NewInstance()->NewSharedObject(test_env_uri, + &env_guard_); + base_env = env_guard_.get(); + EXPECT_OK(s); + EXPECT_NE(Env::Default(), base_env); + } +#endif // !ROCKSDB_LITE + EXPECT_NE(nullptr, base_env); + if (getenv("MEM_ENV")) { + mem_env_ = new MockEnv(base_env); + } +#ifndef ROCKSDB_LITE + if (getenv("ENCRYPTED_ENV")) { + encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, + new CTREncryptionProvider(rot13Cipher_)); + } +#endif // !ROCKSDB_LITE + env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_ + : (mem_env_ ? 
mem_env_ : base_env)); env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); dbname_ = test::PerThreadDBPath(env_, path); @@ -341,6 +352,7 @@ Options DBTestBase::GetOptions( options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kPlainTableFirstBytePrefix: options.table_factory.reset(new PlainTableFactory()); @@ -373,12 +385,14 @@ Options DBTestBase::GetOptions( case kVectorRep: options.memtable_factory.reset(new VectorRepFactory(100)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kHashLinkList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.memtable_factory.reset( NewHashLinkListRepFactory(4, 0, 3, true, 4)); options.allow_concurrent_memtable_write = false; + options.unordered_write = false; break; case kDirectIO: { options.use_direct_reads = true; @@ -540,6 +554,11 @@ Options DBTestBase::GetOptions( options.manual_wal_flush = true; break; } + case kUnorderedWrite: { + options.allow_concurrent_memtable_write = false; + options.unordered_write = false; + break; + } default: break; @@ -565,7 +584,8 @@ void DBTestBase::CreateColumnFamilies(const std::vector& cfs, size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); for (auto cf : cfs) { - ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]); + ASSERT_OK(s); } } @@ -757,7 +777,8 @@ std::string DBTestBase::Get(int cf, const std::string& k, std::vector DBTestBase::MultiGet(std::vector cfs, const std::vector& k, - const Snapshot* snapshot) { + const Snapshot* snapshot, + const bool batched) { ReadOptions options; options.verify_checksums = true; options.snapshot = snapshot; @@ -769,12 +790,30 @@ std::vector DBTestBase::MultiGet(std::vector cfs, handles.push_back(handles_[cfs[i]]); keys.push_back(k[i]); } - std::vector s = db_->MultiGet(options, handles, keys, &result); - for (unsigned int i = 0; i < s.size(); ++i) { - if (s[i].IsNotFound()) { - result[i] = "NOT_FOUND"; - } else if (!s[i].ok()) { - result[i] = s[i].ToString(); + std::vector s; + if (!batched) { + s = db_->MultiGet(options, handles, keys, &result); + for (unsigned int i = 0; i < s.size(); ++i) { + if (s[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } else if (!s[i].ok()) { + result[i] = s[i].ToString(); + } + } + } else { + std::vector pin_values(cfs.size()); + result.resize(cfs.size()); + s.resize(cfs.size()); + db_->MultiGet(options, cfs.size(), handles.data(), keys.data(), + pin_values.data(), s.data()); + for (unsigned int i = 0; i < s.size(); ++i) { + if (s[i].IsNotFound()) { + result[i] = "NOT_FOUND"; + } else if (!s[i].ok()) { + result[i] = s[i].ToString(); + } else { + result[i].assign(pin_values[i].data(), pin_values[i].size()); + } } } return result; diff --git a/db/db_test_util.h b/db/db_test_util.h index 50109e0a406..c9678ee1c3d 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -8,12 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include -#include +#include #include #include @@ -24,9 +21,10 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "env/mock_env.h" +#include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -40,19 +38,18 @@ #include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/utilities/checkpoint.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" +#include "test_util/mock_time_env.h" #include "util/compression.h" -#include "util/filename.h" -#include "util/mock_time_env.h" #include "util/mutexlock.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -140,6 +137,11 @@ class SpecialMemTableRep : public MemTableRep { memtable_->Insert(handle); } + void InsertConcurrently(KeyHandle handle) override { + num_entries_++; + memtable_->Insert(handle); + } + // Returns true iff an entry that compares equal to key is in the list. virtual bool Contains(const char* key) const override { return memtable_->Contains(key); @@ -688,6 +690,7 @@ class DBTestBase : public testing::Test { kPartitionedFilterWithNewTableReaderForCompactions, kUniversalSubcompactions, kxxHash64Checksum, + kUnorderedWrite, // This must be the last line kEnd, }; @@ -699,6 +702,7 @@ class DBTestBase : public testing::Test { MockEnv* mem_env_; Env* encrypted_env_; SpecialEnv* env_; + std::shared_ptr env_guard_; DB* db_; std::vector handles_; @@ -846,7 +850,8 @@ class DBTestBase : public testing::Test { std::vector MultiGet(std::vector cfs, const std::vector& k, - const Snapshot* snapshot = nullptr); + const Snapshot* snapshot, + const bool batched); std::vector MultiGet(const std::vector& k, const Snapshot* snapshot = nullptr); @@ -983,6 +988,11 @@ class DBTestBase : public testing::Test { uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { return options.statistics->getTickerCount(ticker_type); } + + uint64_t TestGetAndResetTickerCount(const Options& options, + Tickers ticker_type) { + return options.statistics->getAndResetTickerCount(ticker_type); + } }; } // namespace rocksdb diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 2bd8af684e0..522f4a2d8b7 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -11,7 +11,7 @@ #include "port/stack_trace.h" #if !defined(ROCKSDB_LITE) #include "rocksdb/utilities/table_properties_collectors.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { @@ -41,10 +41,9 @@ class DBTestUniversalCompaction : public DBTestUniversalCompactionBase { DBTestUniversalCompactionBase("/db_universal_compaction_test") {} }; -class DBTestUniversalDeleteTrigCompaction : public DBTestBase { +class DBTestUniversalCompaction2 : public DBTestBase { public: - DBTestUniversalDeleteTrigCompaction() - : DBTestBase("/db_universal_compaction_test") {} + DBTestUniversalCompaction2() : DBTestBase("/db_universal_compaction_test2") {} 
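+  // Test fixture for additional universal compaction tests, such as the
+  // delete-triggered and periodic compaction cases below.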
}; namespace { @@ -397,7 +396,7 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) { int total_picked_compactions = 0; int total_size_amp_compactions = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { if (arg) { total_picked_compactions++; Compaction* c = static_cast(arg); @@ -441,18 +440,18 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) { ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.max_size_amplification_percent, - 200); + 200U); ASSERT_OK(dbfull()->SetOptions(handles_[1], {{"compaction_options_universal", "{max_size_amplification_percent=110;}"}})); ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.max_size_amplification_percent, - 110); + 110u); ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], &mutable_cf_options)); - ASSERT_EQ(110, mutable_cf_options.compaction_options_universal - .max_size_amplification_percent); + ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal + .max_size_amplification_percent); dbfull()->TEST_WaitForCompact(); // Verify that size amplification did happen @@ -478,7 +477,7 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) { int total_picked_compactions = 0; int total_size_ratio_compactions = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { if (arg) { total_picked_compactions++; Compaction* c = static_cast(arg); @@ -522,20 +521,22 @@ TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) { ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.min_merge_width, - 2); + 2u); ASSERT_EQ(dbfull() ->GetOptions(handles_[1]) .compaction_options_universal.max_merge_width, - 2); + 2u); ASSERT_EQ( dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio, - 100); + 100u); ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], &mutable_cf_options)); - ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100); - ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width, 2); - ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width, 2); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width, + 2u); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width, + 2u); dbfull()->TEST_WaitForCompact(); @@ -837,14 +838,14 @@ TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) { rocksdb::SyncPoint::GetInstance()->LoadDependency( {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0", "BackgroundCallCompaction:0"}, - {"UniversalCompactionPicker::PickCompaction:Return", + {"UniversalCompactionBuilder::PickCompaction:Return", "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"}, {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2", "CompactionJob::Run():Start"}}); int total_picked_compactions = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { if (arg) { total_picked_compactions++; } @@ -1913,7 +1914,7 @@ 
INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId, ::testing::Combine(::testing::Values(1, 8), ::testing::Bool())); -TEST_F(DBTestUniversalDeleteTrigCompaction, BasicL0toL1) { +TEST_F(DBTestUniversalCompaction2, BasicL0toL1) { const int kNumKeys = 3000; const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -1954,7 +1955,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, BasicL0toL1) { ASSERT_GT(NumTableFilesAtLevel(6), 0); } -TEST_F(DBTestUniversalDeleteTrigCompaction, SingleLevel) { +TEST_F(DBTestUniversalCompaction2, SingleLevel) { const int kNumKeys = 3000; const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -1993,7 +1994,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, SingleLevel) { ASSERT_EQ(1, NumTableFilesAtLevel(0)); } -TEST_F(DBTestUniversalDeleteTrigCompaction, MultipleLevels) { +TEST_F(DBTestUniversalCompaction2, MultipleLevels) { const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -2065,7 +2066,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, MultipleLevels) { ASSERT_GT(NumTableFilesAtLevel(6), 0); } -TEST_F(DBTestUniversalDeleteTrigCompaction, OverlappingL0) { +TEST_F(DBTestUniversalCompaction2, OverlappingL0) { const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -2105,7 +2106,7 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, OverlappingL0) { ASSERT_GT(NumTableFilesAtLevel(6), 0); } -TEST_F(DBTestUniversalDeleteTrigCompaction, IngestBehind) { +TEST_F(DBTestUniversalCompaction2, IngestBehind) { const int kNumKeys = 3000; const int kWindowSize = 100; const int kNumDelsTrigger = 90; @@ -2148,6 +2149,96 @@ TEST_F(DBTestUniversalDeleteTrigCompaction, IngestBehind) { ASSERT_GT(NumTableFilesAtLevel(5), 0); } +TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { + Options options; + options.compaction_style = kCompactionStyleUniversal; + + KeepFilterFactory* filter = new KeepFilterFactory(true); + options.compaction_filter_factory.reset(filter); + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + KeepFilter df; + options.compaction_filter_factory.reset(); + options.compaction_filter = &df; + Reopen(options); + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + options.ttl = 60 * 24 * 60 * 60; + options.compaction_filter = nullptr; + Reopen(options); + ASSERT_EQ(60 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); +} + +TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { + Options opts = CurrentOptions(); + opts.env = env_; + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 10; + opts.max_open_files = -1; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days + opts.num_levels = 5; + env_->addon_time_.store(0); + Reopen(opts); + + int periodic_compactions = 0; + int start_level = -1; + int output_level = -1; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", + [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(arg != nullptr); + ASSERT_TRUE(compaction->compaction_reason() == + CompactionReason::kPeriodicCompaction); + start_level = compaction->start_level(); + output_level = compaction->output_level(); + periodic_compactions++; + }); + 
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Oldest flushed file exceeds periodic compaction threshold.
+  ASSERT_OK(Put("foo", "bar"));
+  Flush();
+  ASSERT_EQ(0, periodic_compactions);
+  // Move the clock forward so that the flushed file qualifies for periodic
+  // compaction.
+  env_->addon_time_.store(48 * 60 * 60 + 100);
+
+  // Another flush triggers a compaction of the oldest file.
+  ASSERT_OK(Put("foo", "bar2"));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(1, periodic_compactions);
+  ASSERT_EQ(0, start_level);
+  ASSERT_EQ(4, output_level);
+
+  // Case 2: Oldest compacted file exceeds periodic compaction threshold
+  periodic_compactions = 0;
+  // A flush doesn't trigger a periodic compaction when the threshold is not hit
+  ASSERT_OK(Put("foo", "bar2"));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, periodic_compactions);
+
+  // After the periodic compaction threshold is hit, a flush will trigger
+  // a compaction
+  ASSERT_OK(Put("foo", "bar2"));
+  env_->addon_time_.fetch_add(48 * 60 * 60 + 100);
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(1, periodic_compactions);
+  ASSERT_EQ(0, start_level);
+  ASSERT_EQ(4, output_level);
+}
+
 } // namespace rocksdb
 
 #endif // !defined(ROCKSDB_LITE)
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index 78f72b4a0e7..4e0b08c9a5c 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -11,8 +11,8 @@
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
-#include "util/fault_injection_test_env.h"
-#include "util/sync_point.h"
+#include "test_util/fault_injection_test_env.h"
+#include "test_util/sync_point.h"
 
 namespace rocksdb {
 class DBWALTest : public DBTestBase {
@@ -569,6 +569,56 @@ TEST_F(DBWALTest, GetSortedWalFiles) {
   } while (ChangeWalOptions());
 }
 
+TEST_F(DBWALTest, GetCurrentWalFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+    std::unique_ptr<LogFile>* bad_log_file = nullptr;
+    ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+    std::unique_ptr<LogFile> log_file;
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    // nothing has been written to the log yet
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_EQ(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves forward
+    ASSERT_OK(Put(0, "foo", "v1"));
+    ASSERT_OK(Put(0, "foo2", "v2"));
+    ASSERT_OK(Put(0, "foo3", "v3"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+    // force log files to cycle and add some more data, then check if
+    // the log number moves forward
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+
+    ASSERT_OK(Put(0, "foo4", "v4"));
+    ASSERT_OK(Put(0, "foo5", "v5"));
+    ASSERT_OK(Put(0, "foo6", "v6"));
+
+    ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+    ASSERT_EQ(log_file->StartSequence(), 0);
+    ASSERT_GT(log_file->SizeFileBytes(), 0);
+    ASSERT_EQ(log_file->Type(), kAliveLogFile);
+    ASSERT_GT(log_file->LogNumber(), 0);
+
+  } while (ChangeWalOptions());
+}
+
 TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
   // Test for regression of WAL cleanup missing files that don't contain data
   // for every column family.
@@ -824,7 +874,9 @@ class RecoveryTestHelper { // Create WAL files with values filled in static void FillData(DBWALTest* test, const Options& options, const size_t wal_count, size_t* count) { - const ImmutableDBOptions db_options(options); + // Calling internal functions requires sanitized options. + Options sanitized_options = SanitizeOptions(test->dbname_, options); + const ImmutableDBOptions db_options(sanitized_options); *count = 0; @@ -838,7 +890,8 @@ class RecoveryTestHelper { versions.reset(new VersionSet(test->dbname_, &db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller)); + &write_controller, + /*block_cache_tracer=*/nullptr)); wal_manager.reset(new WalManager(db_options, env_options)); diff --git a/db/db_write_test.cc b/db/db_write_test.cc index e6bab875114..9eca823c2b7 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -12,9 +12,9 @@ #include "db/write_thread.h" #include "port/port.h" #include "port/stack_trace.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { diff --git a/db/dbformat.cc b/db/dbformat.cc index cd2878198c4..a20e2a02d39 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -8,12 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/dbformat.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/coding.h" @@ -163,9 +159,11 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { +LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s, + const Slice* ts) { size_t usize = _user_key.size(); - size_t needed = usize + 13; // A conservative estimate + size_t ts_sz = (nullptr == ts) ? 0 : ts->size(); + size_t needed = usize + ts_sz + 13; // A conservative estimate char* dst; if (needed <= sizeof(space_)) { dst = space_; @@ -174,10 +172,14 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { } start_ = dst; // NOTE: We don't support users keys of more than 2GB :) - dst = EncodeVarint32(dst, static_cast(usize + 8)); + dst = EncodeVarint32(dst, static_cast(usize + ts_sz + 8)); kstart_ = dst; memcpy(dst, _user_key.data(), usize); dst += usize; + if (nullptr != ts) { + memcpy(dst, ts->data(), ts_sz); + dst += ts_sz; + } EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); dst += 8; end_ = dst; diff --git a/db/dbformat.h b/db/dbformat.h index c850adcb01a..090d8c133f3 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -14,6 +14,7 @@ #include #include "db/lookup_key.h" #include "db/merge_context.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -23,11 +24,16 @@ #include "rocksdb/table.h" #include "rocksdb/types.h" #include "util/coding.h" -#include "util/logging.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { +// The file declares data structures and functions that deal with internal +// keys. +// Each internal key contains a user key, a sequence number (SequenceNumber) +// and a type (ValueType), and they are usually encoded together. +// There are some related helper classes here. + class InternalKey; // Value types encoded as the last component of internal keys. 
@@ -88,6 +94,8 @@ static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
 static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64;
 
+// The data structure that represents an internal key in the form where the
+// user key, sequence number and type are stored as separate fields.
 struct ParsedInternalKey {
   Slice user_key;
   SequenceNumber sequence;
@@ -143,6 +151,17 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
   return Slice(internal_key.data(), internal_key.size() - 8);
 }
 
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+                                             size_t ts_sz) {
+  assert(internal_key.size() >= 8 + ts_sz);
+  return Slice(internal_key.data(), internal_key.size() - 8 - ts_sz);
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+  assert(user_key.size() >= ts_sz);
+  return Slice(user_key.data(), user_key.size() - ts_sz);
+}
+
 inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
   assert(internal_key.size() >= 8);
   const size_t n = internal_key.size();
@@ -192,9 +211,7 @@ class InternalKeyComparator
   }
 };
 
-// Modules in this directory should keep internal keys wrapped inside
-// the following class instead of plain strings so that we do not
-// incorrectly use string comparisons instead of an InternalKeyComparator.
+// The class that represents an internal key in its encoded form.
 class InternalKey {
  private:
   std::string rep_;
@@ -295,6 +312,12 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
   return num >> 8;
 }
 
+// A class to store keys efficiently. It allows:
+// 1. Users to either copy the key into it, or have it point to an unowned
+//    address.
+// 2. For a copied key, a short inline buffer is kept to reduce memory
+//    allocation for smaller keys.
+// 3. It tracks user vs. internal key and allows conversion between them.
 class IterKey {
  public:
   IterKey()
@@ -303,6 +326,9 @@ class IterKey {
         key_size_(0),
         buf_size_(sizeof(space_)),
         is_user_key_(true) {}
+  // No copying allowed
+  IterKey(const IterKey&) = delete;
+  void operator=(const IterKey&) = delete;
 
   ~IterKey() { ResetBuffer(); }
 
@@ -500,12 +526,10 @@ class IterKey {
   }
 
   void EnlargeBuffer(size_t key_size);
-
-  // No copying allowed
-  IterKey(const IterKey&) = delete;
-  void operator=(const IterKey&) = delete;
 };
 
+// Convert a SliceTransform of user keys into a SliceTransform of
+// internal keys.
 class InternalKeySliceTransform : public SliceTransform {
  public:
  explicit InternalKeySliceTransform(const SliceTransform* transform)
@@ -631,6 +655,7 @@ inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
   return r;
 }
 
+// Wraps InternalKeyComparator as a comparator class for ParsedInternalKey.
 struct ParsedInternalKeyComparator {
   explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
       : cmp(c) {}
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
index 0b16c13f573..9ec1bc34348 100644
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -8,8 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/dbformat.h" -#include "util/logging.h" -#include "util/testharness.h" +#include "logging/logging.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 3ae464c5842..db6f945a7db 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -13,72 +13,42 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { -class DeleteFileTest : public testing::Test { +class DeleteFileTest : public DBTestBase { public: - std::string dbname_; - Options options_; - DB* db_; - Env* env_; - int numlevels_; - - DeleteFileTest() { - db_ = nullptr; - env_ = Env::Default(); - options_.delete_obsolete_files_period_micros = 0; // always do full purge - options_.enable_thread_tracking = true; - options_.write_buffer_size = 1024*1024*1000; - options_.target_file_size_base = 1024*1024*1000; - options_.max_bytes_for_level_base = 1024*1024*1000; - options_.WAL_ttl_seconds = 300; // Used to test log files - options_.WAL_size_limit_MB = 1024; // Used to test log files - dbname_ = test::PerThreadDBPath("deletefile_test"); - options_.wal_dir = dbname_ + "/wal_files"; - - // clean up all the files that might have been there before - std::vector old_files; - env_->GetChildren(dbname_, &old_files); - for (auto file : old_files) { - env_->DeleteFile(dbname_ + "/" + file); - } - env_->GetChildren(options_.wal_dir, &old_files); - for (auto file : old_files) { - env_->DeleteFile(options_.wal_dir + "/" + file); - } - - DestroyDB(dbname_, options_); - numlevels_ = 7; - EXPECT_OK(ReopenDB(true)); - } - - Status ReopenDB(bool create) { - delete db_; - if (create) { - DestroyDB(dbname_, options_); - } - db_ = nullptr; - options_.create_if_missing = create; - Status s = DB::Open(options_, dbname_, &db_); - assert(db_); - return s; - } - - void CloseDB() { - delete db_; - db_ = nullptr; + const int numlevels_; + const std::string wal_dir_; + + DeleteFileTest() + : DBTestBase("/deletefile_test"), + numlevels_(7), + wal_dir_(dbname_ + "/wal_files") {} + + void SetOptions(Options* options) { + assert(options); + options->delete_obsolete_files_period_micros = 0; // always do full purge + options->enable_thread_tracking = true; + options->write_buffer_size = 1024 * 1024 * 1000; + options->target_file_size_base = 1024 * 1024 * 1000; + options->max_bytes_for_level_base = 1024 * 1024 * 1000; + options->WAL_ttl_seconds = 300; // Used to test log files + options->WAL_size_limit_MB = 1024; // Used to test log files + options->wal_dir = wal_dir_; } void AddKeys(int numkeys, int startkey = 0) { @@ -120,23 +90,20 @@ class DeleteFileTest : public testing::Test { void CreateTwoLevels() { AddKeys(50000, 10000); - DBImpl* dbi = reinterpret_cast(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); for (int i = 0; i < 2; ++i) { - ASSERT_OK(dbi->TEST_CompactRange(i, nullptr, nullptr)); + 
ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr)); } AddKeys(50000, 10000); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); } - void CheckFileTypeCounts(std::string& dir, - int required_log, - int required_sst, - int required_manifest) { + void CheckFileTypeCounts(const std::string& dir, int required_log, + int required_sst, int required_manifest) { std::vector filenames; env_->GetChildren(dir, &filenames); @@ -167,6 +134,12 @@ class DeleteFileTest : public testing::Test { }; TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + CreateTwoLevels(); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -208,15 +181,19 @@ TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { // Lowest level file deletion should succeed. ASSERT_OK(db_->DeleteFile(level2file)); - - CloseDB(); } TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + CreateTwoLevels(); // there should be only one (empty) log file because CreateTwoLevels() // flushes the memtables to disk - CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + CheckFileTypeCounts(wal_dir_, 1, 0, 0); // 2 ssts, 1 manifest CheckFileTypeCounts(dbname_, 0, 2, 1); std::string first("0"), last("999999"); @@ -229,7 +206,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { CheckFileTypeCounts(dbname_, 0, 1, 1); // this time, we keep an iterator alive - ReopenDB(true); + Reopen(options); Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); @@ -239,11 +216,15 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { delete itr; // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - - CloseDB(); } TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -253,9 +234,9 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { // We keep an iterator alive Iterator* itr = nullptr; CreateTwoLevels(); - ReadOptions options; - options.background_purge_on_iterator_cleanup = true; - itr = db_->NewIterator(options); + ReadOptions read_options; + read_options.background_purge_on_iterator_cleanup = true; + itr = db_->NewIterator(read_options); db_->CompactRange(compact_options, &first_slice, &last_slice); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); @@ -277,13 +258,19 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { sleeping_task_after.WaitUntilDone(); // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - - CloseDB(); } TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + auto do_test = [&](bool bg_purge) { ColumnFamilyOptions co; + co.max_write_buffer_size_to_maintain = + static_cast(co.write_buffer_size); WriteOptions wo; FlushOptions fo; 
ColumnFamilyHandle* cfh = nullptr; @@ -305,6 +292,7 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { &sleeping_task_after, Env::Priority::HIGH); // If background purge is enabled, the file should still be there. CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); + TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1"); // Execute background purges. sleeping_task_after.WakeUp(); @@ -318,19 +306,31 @@ TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { do_test(false); } - options_.avoid_unnecessary_blocking_io = true; - ASSERT_OK(ReopenDB(false)); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DeleteFileTest::BackgroundPurgeCFDropTest:1", + "DBImpl::BGWorkPurge:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + options.avoid_unnecessary_blocking_io = true; + options.create_if_missing = false; + Reopen(options); { SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); do_test(true); } - - CloseDB(); } // This test is to reproduce a bug that read invalid ReadOption in iterator // cleanup function TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -340,12 +340,13 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { // We keep an iterator alive Iterator* itr = nullptr; CreateTwoLevels(); - ReadOptions* options = new ReadOptions(); - options->background_purge_on_iterator_cleanup = true; - itr = db_->NewIterator(*options); - // ReadOptions is deleted, but iterator cleanup function should not be - // affected - delete options; + { + ReadOptions read_options; + read_options.background_purge_on_iterator_cleanup = true; + itr = db_->NewIterator(read_options); + // ReadOptions is deleted, but iterator cleanup function should not be + // affected + } db_->CompactRange(compact_options, &first_slice, &last_slice); // 3 sst after compaction with live iterator @@ -361,11 +362,15 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { sleeping_task_after.WaitUntilDone(); // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - - CloseDB(); } TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -374,15 +379,16 @@ TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { // We keep an iterator alive CreateTwoLevels(); - ReadOptions options; - options.background_purge_on_iterator_cleanup = true; - Iterator* itr1 = db_->NewIterator(options); + ReadOptions read_options; + read_options.background_purge_on_iterator_cleanup = true; + Iterator* itr1 = db_->NewIterator(read_options); CreateTwoLevels(); - Iterator* itr2 = db_->NewIterator(options); + Iterator* itr2 = db_->NewIterator(read_options); db_->CompactRange(compact_options, &first_slice, &last_slice); // 5 sst files after 2 compactions with 2 live iterators CheckFileTypeCounts(dbname_, 0, 5, 1); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); // ~DBImpl should wait until all BGWorkPurge are finished rocksdb::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::~DBImpl:WaitJob", 
"DBImpl::BGWorkPurge"}, @@ -394,24 +400,29 @@ TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH); delete itr2; env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH); - CloseDB(); + Close(); TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"); // 1 sst after iterator deletion CheckFileTypeCounts(dbname_, 0, 1, 1); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DeleteFileTest, DeleteFileWithIterator) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + CreateTwoLevels(); - ReadOptions options; - Iterator* it = db_->NewIterator(options); + ReadOptions read_options; + Iterator* it = db_->NewIterator(read_options); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); - std::string level2file = ""; + std::string level2file; - ASSERT_EQ((int)metadata.size(), 2); + ASSERT_EQ(metadata.size(), static_cast(2)); if (metadata[0].level == 1) { level2file = metadata[1].name; } else { @@ -430,10 +441,15 @@ TEST_F(DeleteFileTest, DeleteFileWithIterator) { } ASSERT_EQ(numKeysIterated, 50000); delete it; - CloseDB(); } TEST_F(DeleteFileTest, DeleteLogFiles) { + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); + AddKeys(10, 0); VectorLogPtr logfiles; db_->GetSortedWalFiles(logfiles); @@ -442,11 +458,11 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { // Should not succeed because live logs are not allowed to be deleted std::unique_ptr alive_log = std::move(logfiles.back()); ASSERT_EQ(alive_log->Type(), kAliveLogFile); - ASSERT_OK(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); fprintf(stdout, "Deleting alive log file %s\n", alive_log->PathName().c_str()); ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); - ASSERT_OK(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName())); + ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); logfiles.clear(); // Call Flush to bring about a new working log file and add more keys @@ -460,43 +476,36 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { ASSERT_GT(logfiles.size(), 0UL); std::unique_ptr archived_log = std::move(logfiles.front()); ASSERT_EQ(archived_log->Type(), kArchivedLogFile); - ASSERT_OK( - env_->FileExists(options_.wal_dir + "/" + archived_log->PathName())); + ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); fprintf(stdout, "Deleting archived log file %s\n", archived_log->PathName().c_str()); ASSERT_OK(db_->DeleteFile(archived_log->PathName())); - ASSERT_EQ(Status::NotFound(), env_->FileExists(options_.wal_dir + "/" + - archived_log->PathName())); - CloseDB(); + ASSERT_EQ(Status::NotFound(), + env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); } TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { - CloseDB(); - DBOptions db_options; - db_options.create_if_missing = true; - db_options.create_missing_column_families = true; - std::vector column_families; - column_families.emplace_back(); - column_families.emplace_back("new_cf", ColumnFamilyOptions()); - - std::vector handles; - rocksdb::DB* db; - ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + Options options = CurrentOptions(); + SetOptions(&options); + Destroy(options); + options.create_if_missing = true; + Reopen(options); 
+ CreateAndReopenWithCF({"new_cf"}, options); Random rnd(5); for (int i = 0; i < 1000; ++i) { - ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), - test::RandomKey(&rnd, 10))); + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); } - ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); for (int i = 0; i < 1000; ++i) { - ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), - test::RandomKey(&rnd, 10))); + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); } - ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); std::vector metadata; - db->GetLiveFilesMetaData(&metadata); + db_->GetLiveFilesMetaData(&metadata); ASSERT_EQ(2U, metadata.size()); ASSERT_EQ("new_cf", metadata[0].column_family_name); ASSERT_EQ("new_cf", metadata[1].column_family_name); @@ -506,11 +515,11 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno ? metadata[0].name : metadata[1].name; - ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument()); - ASSERT_OK(db->DeleteFile(old_file)); + ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument()); + ASSERT_OK(db_->DeleteFile(old_file)); { - std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); @@ -519,13 +528,11 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { ASSERT_EQ(count, 1000); } - delete handles[0]; - delete handles[1]; - delete db; + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options); - ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); { - std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); @@ -533,16 +540,22 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { } ASSERT_EQ(count, 1000); } - - delete handles[0]; - delete handles[1]; - delete db; } } //namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/error_handler.cc b/db/error_handler.cc index afec14edcbe..9e1bf5cc107 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,9 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). 
// #include "db/error_handler.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/event_helpers.h" -#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" namespace rocksdb { diff --git a/db/error_handler_test.cc b/db/error_handler_test.cc index d33e19df5d5..c18706fc28e 100644 --- a/db/error_handler_test.cc +++ b/db/error_handler_test.cc @@ -12,9 +12,9 @@ #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #include "rocksdb/sst_file_manager.h" -#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) -#include "util/sync_point.h" +#include "test_util/sync_point.h" #endif namespace rocksdb { diff --git a/db/event_helpers.cc b/db/event_helpers.cc index f1b4b6417ed..f5345c75507 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -70,8 +70,8 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, - const TableProperties& table_properties, TableFileCreationReason reason, - const Status& s) { + uint64_t oldest_blob_file_number, const TableProperties& table_properties, + TableFileCreationReason reason, const Status& s) { if (s.ok() && event_logger) { JSONWriter jwriter; AppendCurrentTime(&jwriter); @@ -106,7 +106,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << "num_entries" << table_properties.num_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << table_properties.num_merge_operands - << "num_range_deletions" << table_properties.num_merge_operands + << "num_range_deletions" << table_properties.num_range_deletions << "format_version" << table_properties.format_version << "fixed_key_len" << table_properties.fixed_key_len << "filter_policy" << table_properties.filter_policy_name @@ -129,6 +129,11 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( } jwriter.EndObject(); } + + if (oldest_blob_file_number != kInvalidBlobFileNumber) { + jwriter << "oldest_blob_file_number" << oldest_blob_file_number; + } + jwriter.EndObject(); event_logger->Log(jwriter); diff --git a/db/event_helpers.h b/db/event_helpers.h index ea35b4b5b19..820eb09be4b 100644 --- a/db/event_helpers.h +++ b/db/event_helpers.h @@ -10,9 +10,9 @@ #include "db/column_family.h" #include "db/version_edit.h" +#include "logging/event_logger.h" #include "rocksdb/listener.h" #include "rocksdb/table_properties.h" -#include "util/event_logger.h" namespace rocksdb { @@ -34,8 +34,8 @@ class EventHelpers { const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, const FileDescriptor& fd, - const TableProperties& table_properties, TableFileCreationReason reason, - const Status& s); + uint64_t oldest_blob_file_number, const TableProperties& table_properties, + TableFileCreationReason reason, const Status& s); static void LogAndNotifyTableFileDeletion( EventLogger* event_logger, int job_id, uint64_t file_number, const std::string& file_path, diff --git a/db/experimental.cc b/db/experimental.cc index d509a37bf2e..0c3c3335d92 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -5,7 +5,7 @@ #include "rocksdb/experimental.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { namespace experimental { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 
256db0728bf..43a003a85cc 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -9,7 +9,8 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/testutil.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" namespace rocksdb { @@ -20,6 +21,7 @@ class ExternalSSTFileBasicTest public: ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") { sst_files_dir_ = dbname_ + "/sst_files/"; + fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); DestroyAndRecreateExternalSSTFilesDir(); } @@ -140,6 +142,7 @@ class ExternalSSTFileBasicTest protected: std::string sst_files_dir_; + std::unique_ptr fault_injection_test_env_; }; TEST_F(ExternalSSTFileBasicTest, Basic) { @@ -689,6 +692,110 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(ExternalSSTFileBasicTest, SyncFailure) { + Options options; + options.create_if_missing = true; + options.env = fault_injection_test_env_.get(); + + std::vector> test_cases = { + {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile", + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"}, + {"ExternalSstFileIngestionJob::BeforeSyncDir", + "ExternalSstFileIngestionJob::AfterSyncDir"}, + {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno", + "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}}; + + for (size_t i = 0; i < test_cases.size(); i++) { + SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(false); + }); + SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) { + fault_injection_test_env_->SetFilesystemActive(true); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + if (i == 2) { + ASSERT_OK(Put("foo", "v1")); + } + + Options sst_file_writer_options; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "sync_failure_test_" + ToString(i) + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + if (i == 0) { + ingest_opt.move_files = true; + } + const Snapshot* snapshot = db_->GetSnapshot(); + if (i == 2) { + ingest_opt.write_global_seqno = true; + } + ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); + } +} + +TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { + Options options; + options.create_if_missing = true; + SpecialEnv senv(Env::Default()); + options.env = &senv; + DestroyAndReopen(options); + + Options sst_file_writer_options; + std::unique_ptr sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + Random rnd(301); + std::string value = DBTestBase::RandomString(&rnd, 4000); + for (int i = 0; i < 5000; i++) { + ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value)); + } + ASSERT_OK(sst_file_writer->Finish()); + + // Ingest it once without verifying checksums to see the baseline + // preads. 
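+  // The second ingestion below repeats this with checksum verification and a
+  // 2 MB readahead, and should only add a bounded number of extra reads.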
+ IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = false; + senv.count_random_reads_ = true; + senv.random_read_bytes_counter_ = 0; + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + + auto base_num_reads = senv.random_read_counter_.Read(); + // Make sure the counter is enabled. + ASSERT_GT(base_num_reads, 0); + + // Ingest again and observe the reads made for readahead. + ingest_opt.move_files = false; + ingest_opt.verify_checksums_before_ingest = true; + ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024}; + + senv.count_random_reads_ = true; + senv.random_read_bytes_counter_ = 0; + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + + // Make sure the counter is enabled. + ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0); + + // The SST file is about 20MB. Readahead size is 2MB. + // Give a conservative 15 reads for metadata blocks, the number + // of random reads should be within 20 MB / 2MB + 15 = 25. + ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40); + + Destroy(options); +} + TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { int kNumLevels = 7; Options options = CurrentOptions(); @@ -779,6 +886,48 @@ TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1)); } +TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { + Options options = CurrentOptions(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // file8.sst (delete 300 => 400) + std::string file8 = sst_files_dir_ + "file8.sst"; + ASSERT_OK(sst_file_writer.Open(file8)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); + ExternalSstFileInfo file8_info; + Status s = sst_file_writer.Finish(&file8_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file8_info.file_path, file8); + ASSERT_EQ(file8_info.num_entries, 0); + ASSERT_EQ(file8_info.smallest_key, ""); + ASSERT_EQ(file8_info.largest_key, ""); + ASSERT_EQ(file8_info.num_range_del_entries, 1); + ASSERT_EQ(file8_info.smallest_range_del_key, Key(300)); + ASSERT_EQ(file8_info.largest_range_del_key, Key(400)); + + // file9.sst (delete 400 => 500) + std::string file9 = sst_files_dir_ + "file9.sst"; + ASSERT_OK(sst_file_writer.Open(file9)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); + ExternalSstFileInfo file9_info; + s = sst_file_writer.Finish(&file9_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file9_info.file_path, file9); + ASSERT_EQ(file9_info.num_entries, 0); + ASSERT_EQ(file9_info.smallest_key, ""); + ASSERT_EQ(file9_info.largest_key, ""); + ASSERT_EQ(file9_info.num_range_del_entries, 1); + ASSERT_EQ(file9_info.smallest_range_del_key, Key(400)); + ASSERT_EQ(file9_info.largest_range_del_key, Key(500)); + + // Range deletion tombstones are exclusive on their end key, so these SSTs + // should not be considered as overlapping.
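The comment above treats each range tombstone as a half-open interval, so two tombstones that merely share a boundary key do not overlap, which is why the DeprecatedAddFile call below is expected to succeed. A small standalone illustration of that check (plain C++, not RocksDB internals):

    #include <cassert>
    #include <string>

    // Half-open intervals [a_begin, a_end) and [b_begin, b_end) overlap only if
    // each one starts strictly before the other one ends.
    bool Overlap(const std::string& a_begin, const std::string& a_end,
                 const std::string& b_begin, const std::string& b_end) {
      return a_begin < b_end && b_begin < a_end;
    }

    int main() {
      // file8 deletes [300, 400) and file9 deletes [400, 500): they touch at key
      // 400 but do not overlap.
      assert(!Overlap("300", "400", "400", "500"));
      return 0;
    }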
+ s = DeprecatedAddFile({file8, file9}); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + DestroyAndRecreateExternalSSTFilesDir(); +} + TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) { bool change_checksum_called = false; const auto& change_checksum = [&](void* arg) { @@ -921,6 +1070,47 @@ TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) { } while (ChangeOptionsForFileIngestionTest()); } +TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { + Options options = CurrentOptions(); + + std::vector files; + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "z")); + ASSERT_OK(sst_file_writer.Put("i", "m")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + } + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.Put("i", "k")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "z"); + ASSERT_EQ(Get("i"), "k"); + + int total_keys = 0; + Iterator* iter = db_->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + total_keys++; + } + delete iter; + ASSERT_EQ(total_keys, 2); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 28b481678ab..fd79ff1d0c1 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -7,24 +7,22 @@ #include "db/external_sst_file_ingestion_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include +#include #include +#include "db/db_impl/db_impl.h" #include "db/version_edit.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" -#include "util/file_util.h" +#include "test_util/sync_point.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -66,65 +64,107 @@ Status ExternalSstFileIngestionJob::Prepare( std::sort( sorted_files.begin(), sorted_files.end(), [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { - return ucmp->Compare(info1->smallest_user_key, - info2->smallest_user_key) < 0; + return sstableKeyCompare(ucmp, info1->smallest_internal_key, + info2->smallest_internal_key) < 0; }); for (size_t i = 0; i < num_files - 1; i++) { - if (ucmp->Compare(sorted_files[i]->largest_user_key, - sorted_files[i + 1]->smallest_user_key) >= 0) { - return Status::NotSupported("Files have overlapping ranges"); + if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= 0) { + files_overlap_ = true; + break; } } } + if (ingestion_options_.ingest_behind && files_overlap_) { + return 
Status::NotSupported("Files have overlapping ranges"); + } + for (IngestedFileInfo& f : files_to_ingest_) { if (f.num_entries == 0 && f.num_range_deletions == 0) { return Status::InvalidArgument("File contain no entries"); } - if (!f.smallest_internal_key().Valid() || - !f.largest_internal_key().Valid()) { + if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { return Status::Corruption("Generated table have corrupted keys"); } } // Copy/Move external files into DB + std::unordered_set ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { f.fd = FileDescriptor(next_file_number++, 0, f.file_size); - + f.copy_file = false; const std::string path_outside_db = f.external_file_path; const std::string path_inside_db = TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); - if (ingestion_options_.move_files) { status = env_->LinkFile(path_outside_db, path_inside_db); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - status = CopyFile(env_, path_outside_db, path_inside_db, 0, - db_options_.use_fsync); + if (status.ok()) { + // It is unsafe to assume application had sync the file and file + // directory before ingest the file. For integrity of RocksDB we need + // to sync the file. + std::unique_ptr file_to_sync; + status = env_->ReopenWritableFile(path_inside_db, &file_to_sync, + env_options_); + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } + } + } else if (status.IsNotSupported() && + ingestion_options_.failed_move_fall_back_to_copy) { + // Original file is on a different FS, use copy instead of hard linking. f.copy_file = true; - } else { - f.copy_file = false; } } else { + f.copy_file = true; + } + + if (f.copy_file) { + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile", + nullptr); + // CopyFile also sync the new file. status = CopyFile(env_, path_outside_db, path_inside_db, 0, db_options_.use_fsync); - f.copy_file = true; } TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded"); if (!status.ok()) { break; } f.internal_file_path = path_inside_db; + ingestion_path_ids.insert(f.fd.GetPathId()); + } + + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir"); + if (status.ok()) { + for (auto path_id : ingestion_path_ids) { + status = directories_->GetDataDir(path_id)->Fsync(); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync directory %" ROCKSDB_PRIszt + " while ingest file: %s", + path_id, status.ToString().c_str()); + break; + } + } } + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir"); + // TODO: The following is duplicated with Cleanup(). 
if (!status.ok()) { // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { if (f.internal_file_path.empty()) { - break; + continue; } Status s = env_->DeleteFile(f.internal_file_path); if (!s.ok()) { @@ -142,8 +182,8 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, SuperVersion* super_version) { autovector ranges; for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) { - ranges.emplace_back(file_to_ingest.smallest_user_key, - file_to_ingest.largest_user_key); + ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), + file_to_ingest.largest_internal_key.user_key()); } Status status = cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed); @@ -176,7 +216,7 @@ Status ExternalSstFileIngestionJob::Run() { } // It is safe to use this instead of LastAllocatedSequence since we are // the only active writer, and hence they are equal - const SequenceNumber last_seqno = versions_->LastSequence(); + SequenceNumber last_seqno = versions_->LastSequence(); edit_.SetColumnFamily(cfd_->GetID()); // The levels that the files will be ingested into @@ -186,8 +226,8 @@ Status ExternalSstFileIngestionJob::Run() { status = CheckLevelForIngestedBehindFile(&f); } else { status = AssignLevelAndSeqnoForIngestedFile( - super_version, force_global_seqno, cfd_->ioptions()->compaction_style, - &f, &assigned_seqno); + super_version, force_global_seqno, cfd_->ioptions()->compaction_style, + last_seqno, &f, &assigned_seqno); } if (!status.ok()) { return status; @@ -195,16 +235,30 @@ Status ExternalSstFileIngestionJob::Run() { status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); - if (assigned_seqno == last_seqno + 1) { - consumed_seqno_ = true; + if (assigned_seqno > last_seqno) { + assert(assigned_seqno == last_seqno + 1); + last_seqno = assigned_seqno; + ++consumed_seqno_count_; } if (!status.ok()) { return status; } + + // We use the import time as the ancester time. This is the time the data + // is written to the database. 
+ int64_t temp_current_time = 0; + uint64_t current_time = kUnknownFileCreationTime; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + if (env_->GetCurrentTime(&temp_current_time).ok()) { + current_time = oldest_ancester_time = + static_cast(temp_current_time); + } + edit_.AddFile(f.picked_level, f.fd.GetNumber(), f.fd.GetPathId(), - f.fd.GetFileSize(), f.smallest_internal_key(), - f.largest_internal_key(), f.assigned_seqno, f.assigned_seqno, - false); + f.fd.GetFileSize(), f.smallest_internal_key, + f.largest_internal_key, f.assigned_seqno, f.assigned_seqno, + false, kInvalidBlobFileNumber, oldest_ancester_time, + current_time); } return status; } @@ -214,6 +268,13 @@ void ExternalSstFileIngestionJob::UpdateStats() { uint64_t total_keys = 0; uint64_t total_l0_files = 0; uint64_t total_time = env_->NowMicros() - job_start_time_; + + EventLoggerStream stream = event_logger_->Log(); + stream << "event" + << "ingest_finished"; + stream << "files_ingested"; + stream.StartArray(); + for (IngestedFileInfo& f : files_to_ingest_) { InternalStats::CompactionStats stats(CompactionReason::kExternalSstIngestion, 1); stats.micros = total_time; @@ -241,7 +302,18 @@ void ExternalSstFileIngestionJob::UpdateStats() { "(global_seqno=%" PRIu64 ")\n", f.external_file_path.c_str(), f.picked_level, f.internal_file_path.c_str(), f.assigned_seqno); + stream << "file" << f.internal_file_path << "level" << f.picked_level; + } + stream.EndArray(); + + stream << "lsm_state"; + stream.StartArray(); + auto vstorage = cfd_->current()->storage_info(); + for (int level = 0; level < vstorage->num_levels(); ++level) { + stream << vstorage->NumLevelFiles(level); } + stream.EndArray(); + cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL, total_keys); cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL, @@ -255,6 +327,9 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { // We failed to add the files to the database // remove all the files we copied for (IngestedFileInfo& f : files_to_ingest_) { + if (f.internal_file_path.empty()) { + continue; + } Status s = env_->DeleteFile(f.internal_file_path); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -262,7 +337,8 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { f.internal_file_path.c_str(), s.ToString().c_str()); } } - consumed_seqno_ = false; + consumed_seqno_count_ = 0; + files_overlap_ = false; } else if (status.ok() && ingestion_options_.move_files) { // The files were moved and added successfully, remove original file links for (IngestedFileInfo& f : files_to_ingest_) { @@ -311,7 +387,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } if (ingestion_options_.verify_checksums_before_ingest) { - status = table_reader->VerifyChecksum(); + // If customized readahead size is needed, we can pass a user option + // all the way to here. Right now we just rely on the default readahead + // to keep things simple. + ReadOptions ro; + ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; + status = table_reader->VerifyChecksum( + ro, TableReaderCaller::kExternalSSTIngestion); } if (!status.ok()) { return status; @@ -371,11 +453,16 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // updating the block cache. 
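The fill_cache handling that continues below is the usual way to keep a large one-off read from displacing hot blocks. A hypothetical application-side sketch of the same idea:

    #include <memory>
    #include "rocksdb/db.h"

    // Sketch: scan the whole DB once without populating the block cache.
    void ScanWithoutCachePollution(rocksdb::DB* db) {
      rocksdb::ReadOptions ro;
      ro.fill_cache = false;  // same trick the ingestion job uses for its bounds scan
      std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
      for (it->SeekToFirst(); it->Valid(); it->Next()) {
        // process it->key() / it->value()
      }
    }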
ro.fill_cache = false; std::unique_ptr iter(table_reader->NewIterator( - ro, sv->mutable_cf_options.prefix_extractor.get())); + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); std::unique_ptr range_del_iter( table_reader->NewRangeTombstoneIterator(ro)); // Get first (smallest) and last (largest) key from file. + file_to_ingest->smallest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); + file_to_ingest->largest_internal_key = + InternalKey("", 0, ValueType::kTypeValue); bool bounds_set = false; iter->SeekToFirst(); if (iter->Valid()) { @@ -385,7 +472,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( if (key.sequence != 0) { return Status::Corruption("external file have non zero sequence number"); } - file_to_ingest->smallest_user_key = key.user_key.ToString(); + file_to_ingest->smallest_internal_key.SetFrom(key); iter->SeekToLast(); if (!ParseInternalKey(iter->key(), &key)) { @@ -394,7 +481,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( if (key.sequence != 0) { return Status::Corruption("external file have non zero sequence number"); } - file_to_ingest->largest_user_key = key.user_key.ToString(); + file_to_ingest->largest_internal_key.SetFrom(key); bounds_set = true; } @@ -410,13 +497,17 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } RangeTombstone tombstone(key, range_del_iter->value()); - if (!bounds_set || ucmp->Compare(tombstone.start_key_, - file_to_ingest->smallest_user_key) < 0) { - file_to_ingest->smallest_user_key = tombstone.start_key_.ToString(); + InternalKey start_key = tombstone.SerializeKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, start_key, + file_to_ingest->smallest_internal_key) < 0) { + file_to_ingest->smallest_internal_key = start_key; } - if (!bounds_set || ucmp->Compare(tombstone.end_key_, - file_to_ingest->largest_user_key) > 0) { - file_to_ingest->largest_user_key = tombstone.end_key_.ToString(); + InternalKey end_key = tombstone.SerializeEndKey(); + if (!bounds_set || + sstableKeyCompare(ucmp, end_key, + file_to_ingest->largest_internal_key) > 0) { + file_to_ingest->largest_internal_key = end_key; } bounds_set = true; } @@ -431,13 +522,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, - IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno) { + SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, + SequenceNumber* assigned_seqno) { Status status; *assigned_seqno = 0; - const SequenceNumber last_seqno = versions_->LastSequence(); if (force_global_seqno) { *assigned_seqno = last_seqno + 1; - if (compaction_style == kCompactionStyleUniversal) { + if (compaction_style == kCompactionStyleUniversal || files_overlap_) { file_to_ingest->picked_level = 0; return status; } @@ -457,9 +548,10 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( if (vstorage->NumLevelFiles(lvl) > 0) { bool overlap_with_level = false; - status = sv->current->OverlapWithLevelIterator(ro, env_options_, - file_to_ingest->smallest_user_key, file_to_ingest->largest_user_key, - lvl, &overlap_with_level); + status = sv->current->OverlapWithLevelIterator( + ro, env_options_, file_to_ingest->smallest_internal_key.user_key(), + file_to_ingest->largest_internal_key.user_key(), lvl, + &overlap_with_level); if (!status.ok()) { return status; } @@ -499,6 
+591,12 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( target_level = lvl; } } + // If files overlap, we have to ingest them at level 0 and assign the newest + // sequence number + if (files_overlap_) { + target_level = 0; + *assigned_seqno = last_seqno + 1; + } TEST_SYNC_POINT_CALLBACK( "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile", &overlap_with_db); @@ -560,6 +658,18 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( std::string seqno_val; PutFixed64(&seqno_val, seqno); status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (status.ok()) { + TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno"); + status = SyncIngestedFile(rwfile.get()); + TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s after writing global " + "sequence number: %s", + file_to_ingest->internal_file_path.c_str(), + status.ToString().c_str()); + } + } if (!status.ok()) { return status; } @@ -580,8 +690,9 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( } auto* vstorage = cfd_->current()->storage_info(); - Slice file_smallest_user_key(file_to_ingest->smallest_user_key); - Slice file_largest_user_key(file_to_ingest->largest_user_key); + Slice file_smallest_user_key( + file_to_ingest->smallest_internal_key.user_key()); + Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key()); if (vstorage->OverlapInLevel(level, &file_smallest_user_key, &file_largest_user_key)) { @@ -600,6 +711,16 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( return true; } +template +Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { + assert(file != nullptr); + if (db_options_.use_fsync) { + return file->Fsync(); + } else { + return file->Sync(); + } +} + } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index baa8e9f0f64..90b8326bbef 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -12,6 +12,7 @@ #include "db/dbformat.h" #include "db/internal_stats.h" #include "db/snapshot_impl.h" +#include "logging/event_logger.h" #include "options/db_options.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -20,13 +21,15 @@ namespace rocksdb { +class Directories; + struct IngestedFileInfo { // External file path std::string external_file_path; - // Smallest user key in external file - std::string smallest_user_key; - // Largest user key in external file - std::string largest_user_key; + // Smallest internal key in external file + InternalKey smallest_internal_key; + // Largest internal key in external file + InternalKey largest_internal_key; // Sequence number for keys in external file SequenceNumber original_seqno; // Offset of the global sequence number field in the file, will @@ -60,15 +63,6 @@ struct IngestedFileInfo { // ingestion_options.move_files is false by default, thus copy_file is true // by default. 
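Taken together with the Prepare() change earlier, the files_overlap_ handling above means mutually overlapping external files are no longer rejected outright (they are only rejected when ingest_behind is set): every such file is placed in L0 and each one is assigned a fresh global sequence number. As a worked example, if LastSequence() is 100 and three overlapping files are ingested in one call, they receive sequence numbers 101, 102 and 103, consumed_seqno_count_ ends up as 3, and all three files land in level 0.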
bool copy_file = true; - - InternalKey smallest_internal_key() const { - return InternalKey(smallest_user_key, assigned_seqno, - ValueType::kTypeValue); - } - - InternalKey largest_internal_key() const { - return InternalKey(largest_user_key, assigned_seqno, ValueType::kTypeValue); - } }; class ExternalSstFileIngestionJob { @@ -77,7 +71,8 @@ class ExternalSstFileIngestionJob { Env* env, VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, - const IngestExternalFileOptions& ingestion_options) + const IngestExternalFileOptions& ingestion_options, + Directories* directories, EventLogger* event_logger) : env_(env), versions_(versions), cfd_(cfd), @@ -85,8 +80,12 @@ class ExternalSstFileIngestionJob { env_options_(env_options), db_snapshots_(db_snapshots), ingestion_options_(ingestion_options), + directories_(directories), + event_logger_(event_logger), job_start_time_(env_->NowMicros()), - consumed_seqno_(false) {} + consumed_seqno_count_(0) { + assert(directories != nullptr); + } // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, @@ -119,8 +118,8 @@ class ExternalSstFileIngestionJob { return files_to_ingest_; } - // Whether to increment VersionSet's seqno after this job runs - bool ShouldIncrementLastSequence() const { return consumed_seqno_; } + // How many sequence numbers did we consume as part of the ingest job? + int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; } private: // Open the external file and populate `file_to_ingest` with all the @@ -135,6 +134,7 @@ class ExternalSstFileIngestionJob { Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, + SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno); @@ -153,6 +153,10 @@ class ExternalSstFileIngestionJob { bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, int level); + // Helper method to sync given file. + template + Status SyncIngestedFile(TWritableFile* file); + Env* env_; VersionSet* versions_; ColumnFamilyData* cfd_; @@ -161,9 +165,14 @@ class ExternalSstFileIngestionJob { SnapshotList* db_snapshots_; autovector files_to_ingest_; const IngestExternalFileOptions& ingestion_options_; + Directories* directories_; + EventLogger* event_logger_; VersionEdit edit_; uint64_t job_start_time_; - bool consumed_seqno_; + int consumed_seqno_count_; + // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are + // ingested in L0 + bool files_overlap_{false}; }; } // namespace rocksdb diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index cbbb2fa2627..3a059773f33 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -7,15 +7,63 @@ #include #include "db/db_test_util.h" +#include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" -#include "util/fault_injection_test_env.h" -#include "util/filename.h" -#include "util/testutil.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/testutil.h" namespace rocksdb { +// A test environment that can be configured to fail the Link operation. 
+class ExternalSSTTestEnv : public EnvWrapper { + public: + ExternalSSTTestEnv(Env* t, bool fail_link) + : EnvWrapper(t), fail_link_(fail_link) {} + + Status LinkFile(const std::string& s, const std::string& t) override { + if (fail_link_) { + return Status::NotSupported("Link failed"); + } + return target()->LinkFile(s, t); + } + + void set_fail_link(bool fail_link) { fail_link_ = fail_link; } + + private: + bool fail_link_; +}; + +class ExternSSTFileLinkFailFallbackTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + ExternSSTFileLinkFailFallbackTest() + : DBTestBase("/external_sst_file_test"), + test_env_(new ExternalSSTTestEnv(env_, true)) { + sst_files_dir_ = dbname_ + "/sst_files/"; + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.env = test_env_; + } + + void TearDown() override { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options_)); + delete test_env_; + test_env_ = nullptr; + } + + protected: + std::string sst_files_dir_; + Options options_; + ExternalSSTTestEnv* test_env_; +}; + class ExternalSSTFileTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -648,10 +696,10 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_EQ(file6_info.smallest_range_del_key, Key(0)); ASSERT_EQ(file6_info.largest_range_del_key, Key(100)); - // file7.sst (delete 100 => 200) + // file7.sst (delete 99 => 201) std::string file7 = sst_files_dir_ + "file7.sst"; ASSERT_OK(sst_file_writer.Open(file7)); - ASSERT_OK(sst_file_writer.DeleteRange(Key(100), Key(200))); + ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201))); ExternalSstFileInfo file7_info; s = sst_file_writer.Finish(&file7_info); ASSERT_TRUE(s.ok()) << s.ToString(); @@ -660,8 +708,8 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_EQ(file7_info.smallest_key, ""); ASSERT_EQ(file7_info.largest_key, ""); ASSERT_EQ(file7_info.num_range_del_entries, 1); - ASSERT_EQ(file7_info.smallest_range_del_key, Key(100)); - ASSERT_EQ(file7_info.largest_range_del_key, Key(200)); + ASSERT_EQ(file7_info.smallest_range_del_key, Key(99)); + ASSERT_EQ(file7_info.largest_range_del_key, Key(201)); // list 1 has internal key range conflict std::vector file_list0({file1, file2}); @@ -676,9 +724,7 @@ TEST_F(ExternalSSTFileTest, AddList) { // These lists of files have key ranges that overlap with each other s = DeprecatedAddFile(file_list1); ASSERT_FALSE(s.ok()) << s.ToString(); - // Both of the following overlap on the end key of a range deletion - // tombstone. This is a limitation because these tombstones have exclusive - // end keys that should not count as overlapping with other keys. + // Both of the following overlap on the range deletion tombstone. s = DeprecatedAddFile(file_list4); ASSERT_FALSE(s.ok()) << s.ToString(); s = DeprecatedAddFile(file_list5); @@ -2014,17 +2060,23 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { } /* - * Test and verify the functionality of ingestion_options.move_files. 
+ * Test and verify the functionality of ingestion_options.move_files and + * ingestion_options.failed_move_fall_back_to_copy */ -TEST_F(ExternalSSTFileTest, LinkExternalSst) { - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - DestroyAndReopen(options); +TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { + const bool fail_link = std::get<0>(GetParam()); + const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); + test_env_->set_fail_link(fail_link); + const EnvOptions env_options; + DestroyAndReopen(options_); const int kNumKeys = 10000; + IngestExternalFileOptions ifo; + ifo.move_files = true; + ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy; std::string file_path = sst_files_dir_ + "file1.sst"; // Create SstFileWriter for default column family - SstFileWriter sst_file_writer(EnvOptions(), options); + SstFileWriter sst_file_writer(env_options, options_); ASSERT_OK(sst_file_writer.Open(file_path)); for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); @@ -2033,9 +2085,13 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(file_path, &file_size)); - IngestExternalFileOptions ifo; - ifo.move_files = true; - ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); + bool copyfile = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:CopyFile", + [&](void* /* arg */) { copyfile = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + const Status s = db_->IngestExternalFile({file_path}, ifo); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); @@ -2049,18 +2105,29 @@ TEST_F(ExternalSSTFileTest, LinkExternalSst) { bytes_copied += stats.bytes_written; bytes_moved += stats.bytes_moved; } - // If bytes_moved > 0, it means external sst resides on the same FS - // supporting hard link operation. Therefore, - // 0 bytes should be copied, and the bytes_moved == file_size. - // Otherwise, FS does not support hard link, or external sst file resides on - // a different file system, then the bytes_copied should be equal to - // file_size. - if (bytes_moved > 0) { + + if (!fail_link) { + // Link operation succeeds. External SST should be moved. + ASSERT_OK(s); ASSERT_EQ(0, bytes_copied); ASSERT_EQ(file_size, bytes_moved); + ASSERT_FALSE(copyfile); } else { - ASSERT_EQ(file_size, bytes_copied); + // Link operation fails. + ASSERT_EQ(0, bytes_moved); + if (failed_move_fall_back_to_copy) { + ASSERT_OK(s); + // Copy file is true since a failed link falls back to copy file. + ASSERT_TRUE(copyfile); + ASSERT_EQ(file_size, bytes_copied); + } else { + ASSERT_TRUE(s.IsNotSupported()); + // Copy file is false since a failed link does not fall back to copy file. 
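From the application's point of view, the behavior verified above comes down to one option on IngestExternalFileOptions. A hedged sketch (helper name and path are hypothetical):

    #include <string>
    #include "rocksdb/db.h"

    // Sketch: request a move, but allow RocksDB to fall back to copying when the
    // hard link cannot be created (for example, the file is on another filesystem).
    // With failed_move_fall_back_to_copy set to false, a failed link surfaces as
    // Status::NotSupported instead.
    rocksdb::Status MoveOrCopyIngest(rocksdb::DB* db, const std::string& sst_path) {
      rocksdb::IngestExternalFileOptions opts;
      opts.move_files = true;
      opts.failed_move_fall_back_to_copy = true;
      return db->IngestExternalFile({sst_path}, opts);
    }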
+ ASSERT_FALSE(copyfile); + ASSERT_EQ(0, bytes_copied); + } } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } class TestIngestExternalFileListener : public EventListener { @@ -2300,10 +2367,11 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { new FaultInjectionTestEnv(env_)); Options options = CurrentOptions(); options.env = fault_injection_env.get(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2317,6 +2385,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2324,8 +2395,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { -1, true, true_data); ASSERT_OK(s); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2357,10 +2429,11 @@ TEST_P(ExternalSSTFileTest, Options options = CurrentOptions(); options.env = fault_injection_env.get(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); const std::vector> data_before_ingestion = {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}}, - {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}}}; + {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}}, + {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}}; for (size_t i = 0; i != handles_.size(); ++i) { int cf = static_cast(i); const auto& orig_data = data_before_ingestion[i]; @@ -2373,6 +2446,7 @@ TEST_P(ExternalSSTFileTest, std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2386,6 +2460,8 @@ TEST_P(ExternalSSTFileTest, {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2439,10 +2515,11 @@ TEST_P(ExternalSSTFileTest, dbfull()->ReleaseSnapshot(read_opts.snapshot); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); // Should see consistent state after ingestion for all column families even // without snapshot. 
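The multi-column-family tests above and below drive DBImpl::IngestExternalFiles, the batched, all-or-nothing entry point. A sketch of calling it from application code, assuming the IngestExternalFileArg struct declared in include/rocksdb/db.h (file paths hypothetical):

    #include <string>
    #include <vector>
    #include "rocksdb/db.h"

    // Sketch: ingest one file into each of two column families atomically;
    // either both column families get their file or neither does.
    rocksdb::Status IngestIntoTwoColumnFamilies(rocksdb::DB* db,
                                                rocksdb::ColumnFamilyHandle* cf0,
                                                rocksdb::ColumnFamilyHandle* cf1) {
      std::vector<rocksdb::IngestExternalFileArg> args(2);
      args[0].column_family = cf0;
      args[0].external_files = {"/tmp/cf0.sst"};
      args[0].options.allow_global_seqno = true;
      args[1].column_family = cf1;
      args[1].external_files = {"/tmp/cf1.sst"};
      args[1].options.allow_global_seqno = true;
      return db->IngestExternalFiles(args);
    }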
- ASSERT_EQ(2, handles_.size()); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2472,10 +2549,11 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"}, }); SyncPoint::GetInstance()->EnableProcessing(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2489,6 +2567,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); + // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2508,8 +2589,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { fault_injection_env->SetFilesystemActive(true); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2538,10 +2620,11 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { "DBImpl::IngestExternalFiles:BeforeJobsRun:1"}, }); SyncPoint::GetInstance()->EnableProcessing(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2555,6 +2638,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2574,8 +2659,9 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { fault_injection_env->SetFilesystemActive(true); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2595,7 +2681,7 @@ TEST_P(ExternalSSTFileTest, Options options = CurrentOptions(); options.env = fault_injection_env.get(); - CreateAndReopenWithCF({"pikachu"}, options); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); SyncPoint::GetInstance()->ClearTrace(); SyncPoint::GetInstance()->DisableProcessing(); @@ -2613,6 +2699,7 @@ 
TEST_P(ExternalSSTFileTest, std::vector column_families; column_families.push_back(handles_[0]); column_families.push_back(handles_[1]); + column_families.push_back(handles_[2]); std::vector ifos(column_families.size()); for (auto& ifo : ifos) { ifo.allow_global_seqno = true; // Always allow global_seqno @@ -2626,6 +2713,8 @@ TEST_P(ExternalSSTFileTest, {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); data.push_back( {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + data.push_back( + {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")}); // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); @@ -2646,8 +2735,9 @@ TEST_P(ExternalSSTFileTest, fault_injection_env->DropUnsyncedFileData(); fault_injection_env->SetFilesystemActive(true); Close(); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); - ASSERT_EQ(2, handles_.size()); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, + options); + ASSERT_EQ(3, handles_.size()); int cf = 0; for (const auto& verify_map : true_data) { for (const auto& elem : verify_map) { @@ -2660,12 +2750,38 @@ TEST_P(ExternalSSTFileTest, Destroy(options, true /* delete_cf_paths */); } +TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) { + Options options = CurrentOptions(); + // Use large buffer to avoid memtable flush + options.write_buffer_size = 1024 * 1024; + options.two_write_queues = true; + DestroyAndReopen(options); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1")); + ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1")); + + // Put one key which is overlap with keys in memtable. + // It will trigger flushing memtable and require this thread is + // currently at the front of the 2nd writer queue. We must make + // sure that it won't enter the 2nd writer queue for the second time. + std::vector> data; + data.push_back(std::make_pair("1001", "v2")); + GenerateAndAddExternalFile(options, data); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, testing::Values(std::make_tuple(false, false), std::make_tuple(false, true), std::make_tuple(true, false), std::make_tuple(true, true))); +INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest, + ExternSSTFileLinkFailFallbackTest, + testing::Values(std::make_tuple(true, false), + std::make_tuple(true, true), + std::make_tuple(false, false))); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 53de312c017..1d18569f2f4 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -11,22 +11,22 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". 
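The header comment above summarizes the crash-simulation approach; the same FaultInjectionTestEnv calls appear in the ingestion tests earlier in this patch. A condensed sketch of the pattern (this is a test-only helper from test_util/, not a public API):

    #include "test_util/fault_injection_test_env.h"

    // Sketch: simulate a crash by dropping everything that was never synced,
    // then bring the "filesystem" back so the DB can be reopened and checked.
    void SimulateCrash(rocksdb::FaultInjectionTestEnv* fault_env) {
      fault_env->SetFilesystemActive(false);  // start failing writes
      fault_env->DropUnsyncedFileData();      // lose all unsynced file contents
      fault_env->SetFilesystemActive(true);   // power comes back
      // ...reopen the DB (with Options::env pointing at fault_env) and verify...
    }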
-#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_format.h" #include "db/version_set.h" #include "env/mock_env.h" +#include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" -#include "util/fault_injection_test_env.h" -#include "util/filename.h" -#include "util/logging.h" +#include "test_util/fault_injection_test_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/mutexlock.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { diff --git a/db/file_indexer.h b/db/file_indexer.h index 1bef3aab0ca..2091f80292b 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -12,8 +12,8 @@ #include #include #include +#include "memory/arena.h" #include "port/port.h" -#include "util/arena.h" #include "util/autovector.h" namespace rocksdb { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 935a01ef8dd..6942aa682d6 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -7,14 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include #include "db/file_indexer.h" +#include #include "db/dbformat.h" #include "db/version_edit.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/filename_test.cc b/db/filename_test.cc index d6bde52834e..bc52e0eae64 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -7,12 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "util/filename.h" +#include "file/filename.h" #include "db/dbformat.h" +#include "logging/logging.h" #include "port/port.h" -#include "util/logging.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/flush_job.cc b/db/flush_job.cc index 4226589e79d..bdb4c179bd8 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -9,11 +9,7 @@ #include "db/flush_job.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include @@ -29,6 +25,11 @@ #include "db/merge_context.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "file/file_util.h" +#include "file/filename.h" +#include "logging/event_logger.h" +#include "logging/log_buffer.h" +#include "logging/logging.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" @@ -38,20 +39,15 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" -#include "util/file_util.h" -#include "util/filename.h" -#include "util/log_buffer.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -229,10 +225,12 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, // This will release and re-acquire the mutex. Status s = WriteLevel0Table(); - if (s.ok() && - (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { - s = Status::ShutdownInProgress( - "Database shutdown or Column family drop during flush"); + if (s.ok() && cfd_->IsDropped()) { + s = Status::ColumnFamilyDropped("Column family dropped during compaction"); + } + if ((s.ok() || s.IsColumnFamilyDropped()) && + shutting_down_->load(std::memory_order_acquire)) { + s = Status::ShutdownInProgress("Database shutdown"); } if (!s.ok()) { @@ -243,7 +241,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, s = cfd_->imm()->TryInstallMemtableFlushResults( cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, - log_buffer_); + log_buffer_, &committed_flush_jobs_info_); } if (s.ok() && file_meta != nullptr) { @@ -367,6 +365,11 @@ Status FlushJob::WriteLevel0Table() { uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime(); + // It's not clear whether oldest_key_time is always available. In case + // it is not available, use current_time. 
+ meta_.oldest_ancester_time = std::min(current_time, oldest_key_time); + meta_.file_creation_time = current_time; + s = BuildTable( dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_, env_options_, cfd_->table_cache(), iter.get(), @@ -394,7 +397,7 @@ Status FlushJob::WriteLevel0Table() { if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { s = output_file_directory_->Fsync(); } - TEST_SYNC_POINT("FlushJob::WriteLevel0Table"); + TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_); db_mutex_->Lock(); } base_->Unref(); @@ -410,8 +413,13 @@ Status FlushJob::WriteLevel0Table() { edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, meta_.fd.smallest_seqno, meta_.fd.largest_seqno, - meta_.marked_for_compaction); + meta_.marked_for_compaction, meta_.oldest_blob_file_number, + meta_.oldest_ancester_time, meta_.file_creation_time); } +#ifndef ROCKSDB_LITE + // Piggyback FlushJobInfo on the first first flushed memtable. + mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); +#endif // !ROCKSDB_LITE // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); @@ -426,4 +434,26 @@ Status FlushJob::WriteLevel0Table() { return s; } +#ifndef ROCKSDB_LITE +std::unique_ptr FlushJob::GetFlushJobInfo() const { + db_mutex_->AssertHeld(); + std::unique_ptr info(new FlushJobInfo{}); + info->cf_id = cfd_->GetID(); + info->cf_name = cfd_->GetName(); + + const uint64_t file_number = meta_.fd.GetNumber(); + info->file_path = + MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number); + info->file_number = file_number; + info->oldest_blob_file_number = meta_.oldest_blob_file_number; + info->thread_id = db_options_.env->GetThreadID(); + info->job_id = job_context_->job_id; + info->smallest_seqno = meta_.fd.smallest_seqno; + info->largest_seqno = meta_.fd.largest_seqno; + info->table_properties = table_properties_; + info->flush_reason = cfd_->GetFlushReason(); + return info; +} +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/flush_job.h b/db/flush_job.h index c4081945623..b25aca3529c 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -11,10 +11,11 @@ #include #include #include +#include #include +#include #include #include -#include #include "db/column_family.h" #include "db/dbformat.h" @@ -28,16 +29,17 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "logging/event_logger.h" #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/listener.h" #include "rocksdb/memtablerep.h" #include "rocksdb/transaction_log.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" -#include "util/event_logger.h" #include "util/stop_watch.h" #include "util/thread_local.h" @@ -79,14 +81,22 @@ class FlushJob { Status Run(LogsWithPrepTracker* prep_tracker = nullptr, FileMetaData* file_meta = nullptr); void Cancel(); - TableProperties GetTableProperties() const { return table_properties_; } const autovector& GetMemTables() const { return mems_; } +#ifndef ROCKSDB_LITE + std::list>* GetCommittedFlushJobsInfo() { + return &committed_flush_jobs_info_; + } +#endif // !ROCKSDB_LITE + private: void ReportStartedFlush(); void ReportFlushInputSize(const autovector& mems); void RecordFlushIOStats(); Status WriteLevel0Table(); +#ifndef ROCKSDB_LITE 
+ std::unique_ptr GetFlushJobInfo() const; +#endif // !ROCKSDB_LITE const std::string& dbname_; ColumnFamilyData* cfd_; @@ -131,6 +141,10 @@ class FlushJob { // In this case, only after all flush jobs succeed in flush can RocksDB // commit to the MANIFEST. const bool write_manifest_; + // The current flush job can commit flush result of a concurrent flush job. + // We collect FlushJobInfo of all jobs committed by current job and fire + // OnFlushCompleted for them. + std::list> committed_flush_jobs_info_; // Variables below are set by PickMemTable(): FileMetaData meta_; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 199ed29cacc..fec7379427a 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -4,19 +4,22 @@ // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include +#include "db/blob_index.h" #include "db/column_family.h" +#include "db/db_impl/db_impl.h" #include "db/flush_job.h" #include "db/version_set.h" +#include "file/writable_file_writer.h" #include "rocksdb/cache.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -35,7 +38,8 @@ class FlushJobTest : public testing::Test { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); @@ -54,7 +58,14 @@ class FlushJobTest : public testing::Test { } void NewDB() { + SetIdentityFile(env_, dbname_); VersionEdit new_db; + if (db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -145,6 +156,7 @@ TEST_F(FlushJobTest, NonEmpty) { // seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ] // key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 999 ] // range-delete "9995" -> "9999" at seqno 10000 + // blob references with seqnos 10001..10006 for (int i = 1; i < 10000; ++i) { std::string key(ToString((i + 1000) % 10000)); std::string value("value" + key); @@ -154,9 +166,43 @@ TEST_F(FlushJobTest, NonEmpty) { inserted_keys.insert({internal_key.Encode().ToString(), value}); } } - new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); - InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); - inserted_keys.insert({internal_key.Encode().ToString(), "9999a"}); + + { + new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); + InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); + inserted_keys.insert({internal_key.Encode().ToString(), "9999a"}); + } + +#ifndef ROCKSDB_LITE + // Note: the first two blob references will not be considered when resolving + // the oldest blob file referenced (the first one is inlined TTL, while the + // second one is TTL and thus points to a TTL blob file). 
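The note above is easier to follow against the numbers used just below: the test emits blob references with blob file numbers {kInvalidBlobFileNumber, 5, 103, 17, 102, 101}. The first entry is an inlined TTL reference and the second a TTL blob reference, so only {103, 17, 102, 101} are considered when resolving the oldest referenced blob file; their minimum is 17, which is exactly what the test later asserts for file_meta.oldest_blob_file_number.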
+ constexpr std::array blob_file_numbers{ + kInvalidBlobFileNumber, 5, 103, 17, 102, 101}; + for (size_t i = 0; i < blob_file_numbers.size(); ++i) { + std::string key(ToString(i + 10001)); + std::string blob_index; + if (i == 0) { + BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL, + "foo"); + } else if (i == 1) { + BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL, + blob_file_numbers[i], /* offset */ i << 10, + /* size */ i << 20, kNoCompression); + } else { + BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i], + /* offset */ i << 10, /* size */ i << 20, + kNoCompression); + } + + const SequenceNumber seq(i + 10001); + new_mem->Add(seq, kTypeBlobIndex, key, blob_index); + + InternalKey internal_key(key, seq, kTypeBlobIndex); + inserted_keys.emplace_hint(inserted_keys.end(), + internal_key.Encode().ToString(), blob_index); + } +#endif autovector to_delete; cfd->imm()->Add(new_mem, &to_delete); @@ -185,11 +231,14 @@ TEST_F(FlushJobTest, NonEmpty) { ASSERT_GT(hist.average, 0.0); ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); - ASSERT_EQ( - "9999a", - file_meta.largest.user_key().ToString()); // range tombstone end key + ASSERT_EQ("9999a", file_meta.largest.user_key().ToString()); ASSERT_EQ(1, file_meta.fd.smallest_seqno); - ASSERT_EQ(10000, file_meta.fd.largest_seqno); // range tombstone seqnum 10000 +#ifndef ROCKSDB_LITE + ASSERT_EQ(10006, file_meta.fd.largest_seqno); + ASSERT_EQ(17, file_meta.oldest_blob_file_number); +#else + ASSERT_EQ(10000, file_meta.fd.largest_seqno); +#endif mock_table_factory_->AssertSingleFile(inserted_keys); job_context.Clean(); } @@ -252,6 +301,7 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { ASSERT_EQ(0, file_meta.fd.smallest_seqno); ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1), file_meta.fd.largest_seqno); + ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number); for (auto m : to_delete) { delete m; @@ -297,18 +347,18 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relevant - std::vector flush_jobs; + std::vector> flush_jobs; k = 0; for (auto cfd : all_cfds) { std::vector snapshot_seqs; - flush_jobs.emplace_back( + flush_jobs.emplace_back(new FlushJob( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), &memtable_ids[k], env_options_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, false /* sync_output_directory */, false /* write_manifest */, - Env::Priority::USER); + Env::Priority::USER)); k++; } HistogramData hist; @@ -317,12 +367,12 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { file_metas.reserve(flush_jobs.size()); mutex_.Lock(); for (auto& job : flush_jobs) { - job.PickMemTable(); + job->PickMemTable(); } for (auto& job : flush_jobs) { FileMetaData meta; // Run will release and re-acquire mutex - ASSERT_OK(job.Run(nullptr /**/, &meta)); + ASSERT_OK(job->Run(nullptr /**/, &meta)); file_metas.emplace_back(meta); } autovector file_meta_ptrs; @@ -331,7 +381,7 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { } autovector*> mems_list; for (size_t i = 0; i != all_cfds.size(); ++i) { - const auto& mems = flush_jobs[i].GetMemTables(); + const auto& mems = flush_jobs[i]->GetMemTables(); mems_list.push_back(&mems); } autovector 
mutable_cf_options_list; diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc index 8735a6b369b..cbcb5ce49f9 100644 --- a/db/flush_scheduler.cc +++ b/db/flush_scheduler.cc @@ -11,11 +11,13 @@ namespace rocksdb { -void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { +void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) { #ifndef NDEBUG - std::lock_guard lock(checking_mutex_); - assert(checking_set_.count(cfd) == 0); - checking_set_.insert(cfd); + { + std::lock_guard lock(checking_mutex_); + assert(checking_set_.count(cfd) == 0); + checking_set_.insert(cfd); + } #endif // NDEBUG cfd->Ref(); // Suppress false positive clang analyzer warnings. @@ -32,9 +34,6 @@ void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { } ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { -#ifndef NDEBUG - std::lock_guard lock(checking_mutex_); -#endif // NDEBUG while (true) { if (head_.load(std::memory_order_relaxed) == nullptr) { return nullptr; } @@ -47,9 +46,12 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { delete node; #ifndef NDEBUG - auto iter = checking_set_.find(cfd); - assert(iter != checking_set_.end()); - checking_set_.erase(iter); + { + std::lock_guard lock(checking_mutex_); + auto iter = checking_set_.find(cfd); + assert(iter != checking_set_.end()); + checking_set_.erase(iter); + } #endif // NDEBUG if (!cfd->IsDropped()) { @@ -65,12 +67,12 @@ ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() { } bool FlushScheduler::Empty() { -#ifndef NDEBUG - std::lock_guard lock(checking_mutex_); -#endif // NDEBUG auto rv = head_.load(std::memory_order_relaxed) == nullptr; #ifndef NDEBUG - assert(rv == checking_set_.empty()); + std::lock_guard lock(checking_mutex_); + // Empty is allowed to be called concurrently with ScheduleWork. It would + // only miss the recent schedules. + assert((rv == checking_set_.empty()) || rv); #endif // NDEBUG return rv; } diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index cd3575861a8..5ca85e88bcf 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -9,25 +9,31 @@ #include #include #include +#include "util/autovector.h" namespace rocksdb { class ColumnFamilyData; -// Unless otherwise noted, all methods on FlushScheduler should be called -// only with the DB mutex held or from a single-threaded recovery context. +// FlushScheduler keeps track of all column families whose memtable may +// be full and require flushing. Unless otherwise noted, all methods on +// FlushScheduler should be called only with the DB mutex held or from +// a single-threaded recovery context. class FlushScheduler { public: FlushScheduler() : head_(nullptr) {} // May be called from multiple threads at once, but not concurrent with // any other method calls on this instance - void ScheduleFlush(ColumnFamilyData* cfd); + void ScheduleWork(ColumnFamilyData* cfd); // Removes and returns Ref()-ed column family. Client needs to Unref(). // Filters column families that have been dropped. ColumnFamilyData* TakeNextColumnFamily(); + // This can be called concurrently with ScheduleWork but it would miss all + // the scheduled flushes after the last synchronization. This would result + // in less precise enforcement of memtable sizes but should not matter much.
bool Empty(); void Clear(); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 94e448ee97d..c875008c769 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -11,7 +11,7 @@ #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/job_context.h" @@ -21,8 +21,8 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -79,7 +79,11 @@ class ForwardLevelIterator : public InternalIterator { read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), *files_[file_index_], read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - prefix_extractor_, nullptr /* table_reader_ptr */, nullptr, false); + prefix_extractor_, /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, + /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -642,7 +646,12 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - sv_->mutable_cf_options.prefix_extractor.get())); + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } BuildLevelIterators(vstorage); current_ = nullptr; @@ -714,7 +723,12 @@ void ForwardIterator::RenewIterators() { read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files_new[inew], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - svnew->mutable_cf_options.prefix_extractor.get())); + svnew->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } for (auto* f : l0_iters_) { @@ -772,8 +786,13 @@ void ForwardIterator::ResetIncompleteIterators() { DeleteIterator(l0_iters_[i]); l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), - *l0_files[i], nullptr /* range_del_agg */, - sv_->mutable_cf_options.prefix_extractor.get()); + *l0_files[i], /*range_del_agg=*/nullptr, + sv_->mutable_cf_options.prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kUserIterator, /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/-1, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 146588d961c..fb73f458edd 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -10,12 +10,12 @@ #include #include +#include "db/dbformat.h" +#include "memory/arena.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" -#include "db/dbformat.h" #include "table/internal_iterator.h" -#include "util/arena.h" namespace rocksdb { diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 113ded94b69..174a258a682 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -3,10 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #if !defined(GFLAGS) || defined(ROCKSDB_LITE) #include int main() { @@ -34,8 +30,8 @@ int main() { return 0; } #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "test_util/testharness.h" #include "util/gflags_compat.h" -#include "util/testharness.h" const int MAX_SHARDS = 100000; diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc new file mode 100644 index 00000000000..f52418a0781 --- /dev/null +++ b/db/import_column_family_job.cc @@ -0,0 +1,269 @@ +#ifndef ROCKSDB_LITE + +#include "db/import_column_family_job.h" + +#include +#include +#include +#include + +#include "db/version_edit.h" +#include "file/file_util.h" +#include "file/random_access_file_reader.h" +#include "table/merging_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/sst_file_writer_collectors.h" +#include "table/table_builder.h" +#include "util/stop_watch.h" + +namespace rocksdb { + +Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, + SuperVersion* sv) { + Status status; + + // Read the information of files we are importing + for (const auto& file_metadata : metadata_) { + const auto file_path = file_metadata.db_path + "/" + file_metadata.name; + IngestedFileInfo file_to_import; + status = GetIngestedFileInfo(file_path, &file_to_import, sv); + if (!status.ok()) { + return status; + } + files_to_import_.push_back(file_to_import); + } + + const auto ucmp = cfd_->internal_comparator().user_comparator(); + auto num_files = files_to_import_.size(); + if (num_files == 0) { + return Status::InvalidArgument("The list of files is empty"); + } else if (num_files > 1) { + // Verify that passed files don't have overlapping ranges in any particular + // level. + int min_level = 1; // Check for overlaps in Level 1 and above. 
+ int max_level = -1; + for (const auto& file_metadata : metadata_) { + if (file_metadata.level > max_level) { + max_level = file_metadata.level; + } + } + for (int level = min_level; level <= max_level; ++level) { + autovector sorted_files; + for (size_t i = 0; i < num_files; i++) { + if (metadata_[i].level == level) { + sorted_files.push_back(&files_to_import_[i]); + } + } + + std::sort(sorted_files.begin(), sorted_files.end(), + [&ucmp](const IngestedFileInfo* info1, + const IngestedFileInfo* info2) { + return sstableKeyCompare(ucmp, info1->smallest_internal_key, + info2->smallest_internal_key) < 0; + }); + + for (size_t i = 0; i < sorted_files.size() - 1; i++) { + if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, + sorted_files[i + 1]->smallest_internal_key) >= + 0) { + return Status::InvalidArgument("Files have overlapping ranges"); + } + } + } + } + + for (const auto& f : files_to_import_) { + if (f.num_entries == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { + return Status::Corruption("File has corrupted keys"); + } + } + + // Copy/Move external files into DB + auto hardlink_files = import_options_.move_files; + for (auto& f : files_to_import_) { + f.fd = FileDescriptor(next_file_number++, 0, f.file_size); + + const auto path_outside_db = f.external_file_path; + const auto path_inside_db = TableFileName( + cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); + + if (hardlink_files) { + status = env_->LinkFile(path_outside_db, path_inside_db); + if (status.IsNotSupported()) { + // Original file is on a different FS, use copy instead of hard linking + hardlink_files = false; + } + } + if (!hardlink_files) { + status = CopyFile(env_, path_outside_db, path_inside_db, 0, + db_options_.use_fsync); + } + if (!status.ok()) { + break; + } + f.copy_file = !hardlink_files; + f.internal_file_path = path_inside_db; + } + + if (!status.ok()) { + // We failed, remove all files that we copied into the db + for (const auto& f : files_to_import_) { + if (f.internal_file_path.empty()) { + break; + } + const auto s = env_->DeleteFile(f.internal_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } + + return status; +} + +// REQUIRES: we have become the only writer by entering both write_thread_ and +// nonmem_write_thread_ +Status ImportColumnFamilyJob::Run() { + Status status; + edit_.SetColumnFamily(cfd_->GetID()); + + // We use the import time as the ancester time. This is the time the data + // is written to the database. + int64_t temp_current_time = 0; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + uint64_t current_time = kUnknownOldestAncesterTime; + if (env_->GetCurrentTime(&temp_current_time).ok()) { + current_time = oldest_ancester_time = + static_cast(temp_current_time); + } + + for (size_t i = 0; i < files_to_import_.size(); ++i) { + const auto& f = files_to_import_[i]; + const auto& file_metadata = metadata_[i]; + + edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), + f.fd.GetFileSize(), f.smallest_internal_key, + f.largest_internal_key, file_metadata.smallest_seqno, + file_metadata.largest_seqno, false, kInvalidBlobFileNumber, + oldest_ancester_time, current_time); + + // If incoming sequence number is higher, update local sequence number. 
+ if (file_metadata.largest_seqno > versions_->LastSequence()) { + versions_->SetLastAllocatedSequence(file_metadata.largest_seqno); + versions_->SetLastPublishedSequence(file_metadata.largest_seqno); + versions_->SetLastSequence(file_metadata.largest_seqno); + } + } + + return status; +} + +void ImportColumnFamilyJob::Cleanup(const Status& status) { + if (!status.ok()) { + // We failed to add files to the database, so remove all the files we copied. + for (const auto& f : files_to_import_) { + const auto s = env_->DeleteFile(f.internal_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } + } + } else if (status.ok() && import_options_.move_files) { + // The files were moved and added successfully; remove the original file links + for (IngestedFileInfo& f : files_to_import_) { + const auto s = env_->DeleteFile(f.external_file_path); + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file link : %s", + f.external_file_path.c_str(), s.ToString().c_str()); + } + } + } +} + +Status ImportColumnFamilyJob::GetIngestedFileInfo( + const std::string& external_file, IngestedFileInfo* file_to_import, + SuperVersion* sv) { + file_to_import->external_file_path = external_file; + + // Get external file size + auto status = env_->GetFileSize(external_file, &file_to_import->file_size); + if (!status.ok()) { + return status; + } + + // Create TableReader for external file + std::unique_ptr table_reader; + std::unique_ptr sst_file; + std::unique_ptr sst_file_reader; + + status = env_->NewRandomAccessFile(external_file, &sst_file, env_options_); + if (!status.ok()) { + return status; + } + sst_file_reader.reset( + new RandomAccessFileReader(std::move(sst_file), external_file)); + + status = cfd_->ioptions()->table_factory->NewTableReader( + TableReaderOptions(*cfd_->ioptions(), + sv->mutable_cf_options.prefix_extractor.get(), + env_options_, cfd_->internal_comparator()), + std::move(sst_file_reader), file_to_import->file_size, &table_reader); + if (!status.ok()) { + return status; + } + + // Get the external file properties + auto props = table_reader->GetTableProperties(); + + // Set original_seqno to 0. + file_to_import->original_seqno = 0; + + // Get number of entries in table + file_to_import->num_entries = props->num_entries; + + ParsedInternalKey key; + ReadOptions ro; + // While reading the external file we can cache the blocks we read in the + // block cache. If we later changed the global seqno of this file, we would + // end up with cached blocks that include keys with the wrong seqno. + // We need to disable fill_cache so that we read from the file without + // updating the block cache.
+ ro.fill_cache = false; + std::unique_ptr iter(table_reader->NewIterator( + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); + + // Get first (smallest) key from file + iter->SeekToFirst(); + if (!ParseInternalKey(iter->key(), &key)) { + return Status::Corruption("external file have corrupted keys"); + } + file_to_import->smallest_internal_key.SetFrom(key); + + // Get last (largest) key from file + iter->SeekToLast(); + if (!ParseInternalKey(iter->key(), &key)) { + return Status::Corruption("external file have corrupted keys"); + } + file_to_import->largest_internal_key.SetFrom(key); + + file_to_import->cf_id = static_cast(props->column_family_id); + + file_to_import->table_properties = *props; + + return status; +} + +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/db/import_column_family_job.h b/db/import_column_family_job.h new file mode 100644 index 00000000000..05796590b61 --- /dev/null +++ b/db/import_column_family_job.h @@ -0,0 +1,70 @@ +#pragma once +#include +#include +#include + +#include "db/column_family.h" +#include "db/dbformat.h" +#include "db/external_sst_file_ingestion_job.h" +#include "db/snapshot_impl.h" +#include "options/db_options.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/metadata.h" +#include "rocksdb/sst_file_writer.h" +#include "util/autovector.h" + +namespace rocksdb { + +// Imports a set of sst files as is into a new column family. Logic is similar +// to ExternalSstFileIngestionJob. +class ImportColumnFamilyJob { + public: + ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, + const EnvOptions& env_options, + const ImportColumnFamilyOptions& import_options, + const std::vector& metadata) + : env_(env), + versions_(versions), + cfd_(cfd), + db_options_(db_options), + env_options_(env_options), + import_options_(import_options), + metadata_(metadata) {} + + // Prepare the job by copying external files into the DB. + Status Prepare(uint64_t next_file_number, SuperVersion* sv); + + // Will execute the import job and prepare edit() to be applied. + // REQUIRES: Mutex held + Status Run(); + + // Cleanup after successful/failed job + void Cleanup(const Status& status); + + VersionEdit* edit() { return &edit_; } + + const autovector& files_to_import() const { + return files_to_import_; + } + + private: + // Open the external file and populate `file_to_import` with all the + // external information we need to import this file. 
+ Status GetIngestedFileInfo(const std::string& external_file, + IngestedFileInfo* file_to_import, + SuperVersion* sv); + + Env* env_; + VersionSet* versions_; + ColumnFamilyData* cfd_; + const ImmutableDBOptions& db_options_; + const EnvOptions& env_options_; + autovector files_to_import_; + VersionEdit edit_; + const ImportColumnFamilyOptions& import_options_; + std::vector metadata_; +}; + +} // namespace rocksdb diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc new file mode 100644 index 00000000000..1138b16bae5 --- /dev/null +++ b/db/import_column_family_test.cc @@ -0,0 +1,567 @@ +#ifndef ROCKSDB_LITE + +#include +#include "db/db_test_util.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/sst_file_writer.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +class ImportColumnFamilyTest : public DBTestBase { + public: + ImportColumnFamilyTest() : DBTestBase("/import_column_family_test") { + sst_files_dir_ = dbname_ + "/sst_files/"; + DestroyAndRecreateExternalSSTFilesDir(); + export_files_dir_ = test::TmpDir(env_) + "/export"; + import_cfh_ = nullptr; + import_cfh2_ = nullptr; + metadata_ptr_ = nullptr; + } + + ~ImportColumnFamilyTest() { + if (import_cfh_) { + db_->DropColumnFamily(import_cfh_); + db_->DestroyColumnFamilyHandle(import_cfh_); + import_cfh_ = nullptr; + } + if (import_cfh2_) { + db_->DropColumnFamily(import_cfh2_); + db_->DestroyColumnFamilyHandle(import_cfh2_); + import_cfh2_ = nullptr; + } + if (metadata_ptr_) { + delete metadata_ptr_; + metadata_ptr_ = nullptr; + } + test::DestroyDir(env_, sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + void DestroyAndRecreateExternalSSTFilesDir() { + test::DestroyDir(env_, sst_files_dir_); + env_->CreateDir(sst_files_dir_); + test::DestroyDir(env_, export_files_dir_); + } + + LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path, + int level, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno) { + LiveFileMetaData metadata; + metadata.name = name; + metadata.db_path = path; + metadata.smallest_seqno = smallest_seqno; + metadata.largest_seqno = largest_seqno; + metadata.level = level; + return metadata; + } + + protected: + std::string sst_files_dir_; + std::string export_files_dir_; + ColumnFamilyHandle* import_cfh_; + ColumnFamilyHandle* import_cfh2_; + ExportImportFilesMetaData* metadata_ptr_; +}; + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + SstFileWriter sfw_unknown(EnvOptions(), options); + + // cf1.sst + const std::string cf1_sst_name = "cf1.sst"; + const std::string cf1_sst = sst_files_dir_ + cf1_sst_name; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + + // cf_unknown.sst + const std::string unknown_sst_name = "cf_unknown.sst"; + const std::string unknown_sst = sst_files_dir_ + unknown_sst_name; + ASSERT_OK(sfw_unknown.Open(unknown_sst)); + ASSERT_OK(sfw_unknown.Put("K3", "V1")); + ASSERT_OK(sfw_unknown.Put("K4", "V2")); + ASSERT_OK(sfw_unknown.Finish()); + + { + // Import sst file corresponding to cf1 onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + 
ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_EQ(value, "V2"); + ASSERT_OK(db_->DropColumnFamily(import_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; + } + + { + // Import sst file corresponding to unknown cf onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + std::string value; + db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_EQ(value, "V1"); + db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_EQ(value, "V2"); + } +} + +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + + // file3.sst + const std::string file3_sst_name = "file3.sst"; + const std::string file3_sst = sst_files_dir_ + file3_sst_name; + ASSERT_OK(sfw_cf1.Open(file3_sst)); + for (int i = 0; i < 100; ++i) { + sfw_cf1.Put(Key(i), Key(i) + "_val"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file2.sst + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + for (int i = 0; i < 100; i += 2) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1a.sst + const std::string file1a_sst_name = "file1a.sst"; + const std::string file1a_sst = sst_files_dir_ + file1a_sst_name; + ASSERT_OK(sfw_cf1.Open(file1a_sst)); + for (int i = 0; i < 52; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file1b.sst + const std::string file1b_sst_name = "file1b.sst"; + const std::string file1b_sst = sst_files_dir_ + file1b_sst_name; + ASSERT_OK(sfw_cf1.Open(file1b_sst)); + for (int i = 52; i < 100; i += 4) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0a.sst + const std::string file0a_sst_name = "file0a.sst"; + const std::string file0a_sst = sst_files_dir_ + file0a_sst_name; + ASSERT_OK(sfw_cf1.Open(file0a_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // file0b.sst + const std::string file0b_sst_name = "file0b.sst"; + const std::string file0b_sst = sst_files_dir_ + file0b_sst_name; + ASSERT_OK(sfw_cf1.Open(file0b_sst)); + for (int i = 0; i < 100; i += 16) { + sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"); + } + ASSERT_OK(sfw_cf1.Finish()); + + // Import sst files and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29)); + metadata.files.push_back( + LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34)); + metadata.files.push_back( + LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39)); + metadata.files.push_back( + 
LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49)); + metadata.files.push_back( + LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + for (int i = 0; i < 100; i += 5) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5")); + } + + // Flush and check again + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } + + // Compact and check again. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + for (int i = 0; i < 100; i++) { + std::string value; + db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + if (i % 5 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite5"); + } else if (i % 16 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite4"); + } else if (i % 4 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite2"); + } else if (i % 2 == 0) { + ASSERT_EQ(value, Key(i) + "_overwrite1"); + } else { + ASSERT_EQ(value, Key(i) + "_val"); + } + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + *metadata_ptr_, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + import_options.move_files = true; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options, + *metadata_ptr_, &import_cfh2_)); + ASSERT_NE(import_cfh2_, nullptr); + delete metadata_ptr_; + metadata_ptr_ = NULL; + + std::string value1, value2; + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Get(1, Key(i)), value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Modify keys in cf1 and verify. + for (int i = 0; i < 25; i++) { + ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i))); + } + for (int i = 25; i < 50; i++) { + ASSERT_OK( + db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3")); + } + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } + + // Compact and check again. + ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); + + for (int i = 0; i < 25; ++i) { + ASSERT_TRUE( + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); + } + for (int i = 25; i < 50; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite3", value1); + } + for (int i = 50; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_EQ(Key(i) + "_overwrite2", value1); + } + + for (int i = 0; i < 100; ++i) { + db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_EQ(Get(1, Key(i)), value2); + } +} + +TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + Put(1, Key(i), Key(i) + "_val"); + } + ASSERT_OK(Flush(1)); + + // Compact to create a L1 file. + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 50; ++i) { + Put(1, Key(i), Key(i) + "_overwrite"); + } + + // Flush to create L0 file. + ASSERT_OK(Flush(1)); + + for (int i = 0; i < 25; ++i) { + Put(1, Key(i), Key(i) + "_overwrite2"); + } + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + // Create a new db and import the files. + DB* db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + *metadata_ptr_, &cfh)); + ASSERT_NE(cfh, nullptr); + + for (int i = 0; i < 100; ++i) { + std::string value; + db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_EQ(Get(1, Key(i)), value); + } + db_copy->DropColumnFamily(cfh); + db_copy->DestroyColumnFamilyHandle(cfh); + delete db_copy; + test::DestroyDir(env_, dbname_ + "/db_copy"); +} + +TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + { + // Create column family with existing cf name. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Column family already exists")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with no files specified. + ExportImportFilesMetaData metadata; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("The list of files is empty")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with overlapping keys in sst files. + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file2_sst_name = "file2.sst"; + const std::string file2_sst = sst_files_dir_ + file2_sst_name; + ASSERT_OK(sfw_cf1.Open(file2_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K3", "V3")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Files have overlapping ranges")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with a mismatching comparator, should fail with appropriate error. 
+ ExportImportFilesMetaData metadata; + Options mismatch_options = CurrentOptions(); + mismatch_options.comparator = ReverseBytewiseComparator(); + SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Finish()); + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = mismatch_options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::InvalidArgument("Comparator name mismatch")); + ASSERT_EQ(import_cfh_, nullptr); + } + + { + // Import with non existent sst file should fail with appropriate error + ExportImportFilesMetaData metadata; + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + const std::string file1_sst_name = "file1.sst"; + const std::string file1_sst = sst_files_dir_ + file1_sst_name; + ASSERT_OK(sfw_cf1.Open(file1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.Finish()); + const std::string file3_sst_name = "file3.sst"; + + metadata.files.push_back( + LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.files.push_back( + LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_), + Status::IOError("No such file or directory")); + ASSERT_EQ(import_cfh_, nullptr); + + // Test successful import after a failure with the same CF name. 
Ensures + // there is no side effect with CF when there is a failed import + metadata.files.pop_back(); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as External SST File Writer and Import are not supported " + "in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 57c7427e801..94d9cd8ac5b 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -10,20 +10,16 @@ #include "db/internal_stats.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include #include #include "db/column_family.h" -#include "db/db_impl.h" -#include "table/block_based_table_factory.h" +#include "db/db_impl/db_impl.h" +#include "table/block_based/block_based_table_factory.h" #include "util/string_util.h" namespace rocksdb { @@ -958,14 +954,17 @@ void InternalStats::DumpDBStats(std::string* value) { seconds_up, interval_seconds_up); value->append(buf); // Cumulative - uint64_t user_bytes_written = GetDBStats(InternalStats::BYTES_WRITTEN); - uint64_t num_keys_written = GetDBStats(InternalStats::NUMBER_KEYS_WRITTEN); - uint64_t write_other = GetDBStats(InternalStats::WRITE_DONE_BY_OTHER); - uint64_t write_self = GetDBStats(InternalStats::WRITE_DONE_BY_SELF); - uint64_t wal_bytes = GetDBStats(InternalStats::WAL_FILE_BYTES); - uint64_t wal_synced = GetDBStats(InternalStats::WAL_FILE_SYNCED); - uint64_t write_with_wal = GetDBStats(InternalStats::WRITE_WITH_WAL); - uint64_t write_stall_micros = GetDBStats(InternalStats::WRITE_STALL_MICROS); + uint64_t user_bytes_written = + GetDBStats(InternalStats::kIntStatsBytesWritten); + uint64_t num_keys_written = + GetDBStats(InternalStats::kIntStatsNumKeysWritten); + uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther); + uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf); + uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes); + uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced); + uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal); + uint64_t write_stall_micros = + GetDBStats(InternalStats::kIntStatsWriteStallMicros); const int kHumanMicrosLen = 32; char human_micros[kHumanMicrosLen]; diff --git a/db/internal_stats.h b/db/internal_stats.h index 20fb07f4853..24a8d98e6db 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -109,15 +109,15 @@ class InternalStats { }; enum InternalDBStatsType { - WAL_FILE_BYTES, - WAL_FILE_SYNCED, - BYTES_WRITTEN, - NUMBER_KEYS_WRITTEN, - WRITE_DONE_BY_OTHER, - WRITE_DONE_BY_SELF, - WRITE_WITH_WAL, - WRITE_STALL_MICROS, - INTERNAL_DB_STATS_ENUM_MAX, + kIntStatsWalFileBytes, + kIntStatsWalFileSynced, + kIntStatsBytesWritten, + kIntStatsNumKeysWritten, + kIntStatsWriteDoneByOther, + kIntStatsWriteDoneBySelf, + kIntStatsWriteWithWal, + kIntStatsWriteStallMicros, + kIntStatsNumMax, }; InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) @@ -237,6 +237,28 @@ class InternalStats { } } + CompactionStats& operator=(const 
CompactionStats& c) { + micros = c.micros; + cpu_micros = c.cpu_micros; + bytes_read_non_output_levels = c.bytes_read_non_output_levels; + bytes_read_output_level = c.bytes_read_output_level; + bytes_written = c.bytes_written; + bytes_moved = c.bytes_moved; + num_input_files_in_non_output_levels = + c.num_input_files_in_non_output_levels; + num_input_files_in_output_level = c.num_input_files_in_output_level; + num_output_files = c.num_output_files; + num_input_records = c.num_input_records; + num_dropped_records = c.num_dropped_records; + count = c.count; + + int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); + for (int i = 0; i < num_of_reasons; i++) { + counts[i] = c.counts[i]; + } + return *this; + } + void Clear() { this->micros = 0; this->cpu_micros = 0; @@ -300,7 +322,7 @@ class InternalStats { }; void Clear() { - for (int i = 0; i < INTERNAL_DB_STATS_ENUM_MAX; i++) { + for (int i = 0; i < kIntStatsNumMax; i++) { db_stats_[i].store(0); } for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) { @@ -394,7 +416,7 @@ class InternalStats { bool HandleBlockCacheStat(Cache** block_cache); // Per-DB stats - std::atomic db_stats_[INTERNAL_DB_STATS_ENUM_MAX]; + std::atomic db_stats_[kIntStatsNumMax]; // Per-ColumnFamily stats uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX]; uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX]; @@ -593,15 +615,15 @@ class InternalStats { }; enum InternalDBStatsType { - WAL_FILE_BYTES, - WAL_FILE_SYNCED, - BYTES_WRITTEN, - NUMBER_KEYS_WRITTEN, - WRITE_DONE_BY_OTHER, - WRITE_DONE_BY_SELF, - WRITE_WITH_WAL, - WRITE_STALL_MICROS, - INTERNAL_DB_STATS_ENUM_MAX, + kIntStatsWalFileBytes, + kIntStatsWalFileSynced, + kIntStatsBytesWritten, + kIntStatsNumKeysWritten, + kIntStatsWriteDoneByOther, + kIntStatsWriteDoneBySelf, + kIntStatsWriteWithWal, + kIntStatsWriteStallMicros, + kIntStatsNumMax, }; InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {} diff --git a/db/listener_test.cc b/db/listener_test.cc index 56968d8f803..0e8bae40785 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -3,11 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "db/db_impl.h" +#include "db/blob_index.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -21,17 +24,15 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" -#include "util/filename.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" #ifndef ROCKSDB_LITE @@ -42,6 +43,14 @@ class EventListenerTest : public DBTestBase { public: EventListenerTest() : DBTestBase("/listener_test") {} + static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, + uint64_t size) { + std::string blob_index; + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + return blob_index; + } + const size_t k110KB = 110 << 10; }; @@ -79,11 +88,47 @@ class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { class TestCompactionListener : public EventListener { public: + explicit TestCompactionListener(EventListenerTest* test) : test_(test) {} + void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override { std::lock_guard lock(mutex_); compacted_dbs_.push_back(db); ASSERT_GT(ci.input_files.size(), 0U); + ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size()); + + for (size_t i = 0; i < ci.input_file_infos.size(); ++i) { + ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level); + ASSERT_EQ(ci.input_file_infos[i].file_number, + TableFileNameToNumber(ci.input_files[i])); + } + ASSERT_GT(ci.output_files.size(), 0U); + ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size()); + + ASSERT_TRUE(test_); + ASSERT_EQ(test_->db_, db); + + std::vector> files_by_level; + test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id], + &files_by_level); + ASSERT_GT(files_by_level.size(), ci.output_level); + + for (size_t i = 0; i < ci.output_file_infos.size(); ++i) { + ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level); + ASSERT_EQ(ci.output_file_infos[i].file_number, + TableFileNameToNumber(ci.output_files[i])); + + auto it = std::find_if( + files_by_level[ci.output_level].begin(), + files_by_level[ci.output_level].end(), [&](const FileMetaData& meta) { + return meta.fd.GetNumber() == ci.output_file_infos[i].file_number; + }); + ASSERT_NE(it, files_by_level[ci.output_level].end()); + + ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number, + it->oldest_blob_file_number); + } + ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id); ASSERT_GT(ci.thread_id, 0U); @@ -98,6 +143,7 @@ class TestCompactionListener : public EventListener { } } + EventListenerTest* test_; std::vector compacted_dbs_; std::mutex mutex_; }; @@ -125,13 +171,19 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { options.table_properties_collector_factories.push_back( std::make_shared()); - TestCompactionListener* listener = new 
TestCompactionListener(); + TestCompactionListener* listener = new TestCompactionListener(this); options.listeners.emplace_back(listener); std::vector cf_names = { "pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}; CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto", + BlobStr(123, 0, 1 << 10))); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); @@ -140,11 +192,9 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - const Slice kRangeStart = "a"; - const Slice kRangeEnd = "z"; - ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], - &kRangeStart, &kRangeEnd)); dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], + nullptr, nullptr)); dbfull()->TEST_WaitForCompact(); } @@ -157,8 +207,8 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { // This simple Listener can only handle one flush at a time. class TestFlushListener : public EventListener { public: - explicit TestFlushListener(Env* env) - : slowdown_count(0), stop_count(0), db_closed(), env_(env) { + TestFlushListener(Env* env, EventListenerTest* test) + : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { db_closed = false; } void OnTableFileCreated( @@ -210,6 +260,27 @@ class TestFlushListener : public EventListener { ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name); ASSERT_EQ(prev_fc_info_.job_id, info.job_id); ASSERT_EQ(prev_fc_info_.file_path, info.file_path); + ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number); + + // Note: the following chunk relies on the notification pertaining to the + // database pointed to by DBTestBase::db_, and is thus bypassed when + // that assumption does not hold (see the test case MultiDBMultiListeners + // below). 
+ ASSERT_TRUE(test_); + if (db == test_->db_) { + std::vector> files_by_level; + test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id], + &files_by_level); + + ASSERT_FALSE(files_by_level.empty()); + auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(), + [&](const FileMetaData& meta) { + return meta.fd.GetNumber() == info.file_number; + }); + ASSERT_NE(it, files_by_level[0].end()); + ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number); + } + ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id); ASSERT_GT(info.thread_id, 0U); ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second, @@ -226,6 +297,7 @@ class TestFlushListener : public EventListener { protected: Env* env_; + EventListenerTest* test_; }; TEST_F(EventListenerTest, OnSingleDBFlushTest) { @@ -235,7 +307,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env); + TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); std::vector cf_names = { "pikachu", "ilya", "muromec", "dobrynia", @@ -245,6 +317,12 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto", + BlobStr(456, 0, 1 << 10))); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); @@ -272,7 +350,7 @@ TEST_F(EventListenerTest, MultiCF) { #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env); + TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); options.table_properties_collector_factories.push_back( std::make_shared()); @@ -313,7 +391,7 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { const int kNumDBs = 5; const int kNumListeners = 10; for (int i = 0; i < kNumListeners; ++i) { - listeners.emplace_back(new TestFlushListener(options.env)); + listeners.emplace_back(new TestFlushListener(options.env, this)); } std::vector cf_names = { @@ -390,7 +468,7 @@ TEST_F(EventListenerTest, DisableBGCompaction) { #ifdef ROCKSDB_USING_THREAD_STATUS options.enable_thread_tracking = true; #endif // ROCKSDB_USING_THREAD_STATUS - TestFlushListener* listener = new TestFlushListener(options.env); + TestFlushListener* listener = new TestFlushListener(options.env, this); const int kCompactionTrigger = 1; const int kSlowdownTrigger = 5; const int kStopTrigger = 100; diff --git a/db/log_reader.cc b/db/log_reader.cc index e734e9d6c88..3a71cbc4291 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -10,10 +10,11 @@ #include "db/log_reader.h" #include +#include "file/sequence_file_reader.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" #include "util/util.h" namespace rocksdb { diff --git a/db/log_reader.h b/db/log_reader.h index bda9ac8bb35..5f9cb981dba 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -12,13 +12,12 @@ #include #include "db/log_format.h" +#include 
"file/sequence_file_reader.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" -#include "rocksdb/options.h" namespace rocksdb { - -class SequentialFileReader; class Logger; namespace log { @@ -53,6 +52,9 @@ class Reader { // @lint-ignore TXT2 T25377293 Grandfathered in std::unique_ptr&& file, Reporter* reporter, bool checksum, uint64_t log_num); + // No copying allowed + Reader(const Reader&) = delete; + void operator=(const Reader&) = delete; virtual ~Reader(); @@ -148,11 +150,6 @@ class Reader { // buffer_ must be updated to remove the dropped bytes prior to invocation. void ReportCorruption(size_t bytes, const char* reason); void ReportDrop(size_t bytes, const Status& reason); - - private: - // No copying allowed - Reader(const Reader&); - void operator=(const Reader&); }; class FragmentBufferedReader : public Reader { diff --git a/db/log_test.cc b/db/log_test.cc index fd237b030e7..ecfae3e2db3 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -9,13 +9,14 @@ #include "db/log_reader.h" #include "db/log_writer.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" #include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { namespace log { diff --git a/db/log_writer.cc b/db/log_writer.cc index 6ee39198184..53efc6c15b3 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -10,10 +10,10 @@ #include "db/log_writer.h" #include +#include "file/writable_file_writer.h" #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" namespace rocksdb { namespace log { @@ -102,6 +102,13 @@ Status Writer::AddRecord(const Slice& slice) { left -= fragment_length; begin = false; } while (s.ok() && left > 0); + + if (s.ok()) { + if (!manual_flush_) { + s = dest_->Flush(); + } + } + return s; } @@ -146,11 +153,6 @@ Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { Status s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - if (!manual_flush_) { - s = dest_->Flush(); - } - } } block_offset_ += header_size + n; return s; diff --git a/db/log_writer.h b/db/log_writer.h index 116d033584a..e5ed71a764d 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -73,6 +73,10 @@ class Writer { explicit Writer(std::unique_ptr&& dest, uint64_t log_number, bool recycle_log_files, bool manual_flush = false); + // No copying allowed + Writer(const Writer&) = delete; + void operator=(const Writer&) = delete; + ~Writer(); Status AddRecord(const Slice& slice); @@ -104,10 +108,6 @@ class Writer { // If true, it does not flush after each write. Instead it relies on the upper // layer to manually does the flush by calling ::WriteBuffer() bool manual_flush_; - - // No copying allowed - Writer(const Writer&); - void operator=(const Writer&); }; } // namespace log diff --git a/db/lookup_key.h b/db/lookup_key.h index ddf4ff0e942..1b0f6f56290 100644 --- a/db/lookup_key.h +++ b/db/lookup_key.h @@ -21,7 +21,8 @@ class LookupKey { public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. 
- LookupKey(const Slice& _user_key, SequenceNumber sequence); + LookupKey(const Slice& _user_key, SequenceNumber sequence, + const Slice* ts = nullptr); ~LookupKey(); diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index bcee5c3fbfe..1dfe0d55b43 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -20,10 +20,6 @@ namespace rocksdb { #ifdef ROCKSDB_JEMALLOC -#ifdef JEMALLOC_NO_RENAME -#define malloc_stats_print je_malloc_stats_print -#endif - typedef struct { char* cur; char* end; diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 02732a55583..1a69a89dea0 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -8,12 +8,12 @@ #include #include -#include "rocksdb/db.h" +#include "port/port.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" #include "rocksdb/slice.h" #include "rocksdb/write_batch.h" -#include "util/testharness.h" -#include "port/port.h" +#include "test_util/testharness.h" using namespace rocksdb; diff --git a/db/memtable.cc b/db/memtable.cc index 0c706115de0..e3c531c316e 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -10,15 +10,17 @@ #include "db/memtable.h" #include +#include #include #include - #include "db/dbformat.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" +#include "memory/arena.h" +#include "memory/memory_usage.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/port.h" @@ -31,10 +33,8 @@ #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "table/merging_iterator.h" -#include "util/arena.h" #include "util/autovector.h" #include "util/coding.h" -#include "util/memory_usage.h" #include "util/mutexlock.h" #include "util/util.h" @@ -105,7 +105,8 @@ MemTable::MemTable(const InternalKeyComparator& cmp, insert_with_hint_prefix_extractor_( ioptions.memtable_insert_with_hint_prefix_extractor), oldest_key_time_(std::numeric_limits::max()), - atomic_flush_seqno_(kMaxSequenceNumber) { + atomic_flush_seqno_(kMaxSequenceNumber), + approximate_memory_usage_(0) { UpdateFlushState(); // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); @@ -115,7 +116,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, moptions_.memtable_prefix_bloom_bits > 0) { bloom_filter_.reset( new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, - ioptions.bloom_locality, 6 /* hard coded 6 probes */, + 6 /* hard coded 6 probes */, moptions_.memtable_huge_page_size, ioptions.info_log)); } } @@ -139,11 +140,12 @@ size_t MemTable::ApproximateMemoryUsage() { } total_usage += usage; } + approximate_memory_usage_.store(total_usage, std::memory_order_relaxed); // otherwise, return the actual usage return total_usage; } -bool MemTable::ShouldFlushNow() const { +bool MemTable::ShouldFlushNow() { size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed); // In a lot of times, we cannot allocate arena blocks that exactly matches the // buffer size. Thus we have to decide if we should over-allocate or @@ -159,6 +161,8 @@ bool MemTable::ShouldFlushNow() const { range_del_table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes(); + approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed); + // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. 
if (allocated_memory + kArenaBlockSize < @@ -291,6 +295,9 @@ class MemTableIterator : public InternalIterator { iter_ = mem.table_->GetIterator(arena); } } + // No copying allowed + MemTableIterator(const MemTableIterator&) = delete; + void operator=(const MemTableIterator&) = delete; ~MemTableIterator() override { #ifndef NDEBUG @@ -404,10 +411,6 @@ class MemTableIterator : public InternalIterator { bool valid_; bool arena_mode_; bool value_pinned_; - - // No copying allowed - MemTableIterator(const MemTableIterator&); - void operator=(const MemTableIterator&); }; InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, @@ -439,7 +442,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( } port::RWMutex* MemTable::GetLock(const Slice& key) { - return &locks_[static_cast(GetSliceNPHash64(key)) % locks_.size()]; + return &locks_[fastrange64(GetSliceNPHash64(key), locks_.size())]; } MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, @@ -466,7 +469,7 @@ MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, bool MemTable::Add(SequenceNumber s, ValueType type, const Slice& key, /* user key */ const Slice& value, bool allow_concurrent, - MemTablePostProcessInfo* post_process_info) { + MemTablePostProcessInfo* post_process_info, void** hint) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -493,6 +496,8 @@ bool MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (!allow_concurrent) { // Extract prefix for insert with hint. if (insert_with_hint_prefix_extractor_ != nullptr && @@ -525,7 +530,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->Add(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(key); + bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); } // The first sequence number inserted into the memtable @@ -542,7 +547,9 @@ bool MemTable::Add(SequenceNumber s, ValueType type, assert(post_process_info == nullptr); UpdateFlushState(); } else { - bool res = table->InsertKeyConcurrently(handle); + bool res = (hint == nullptr) + ? table->InsertKeyConcurrently(handle) + : table->InsertKeyWithHintConcurrently(handle, hint); if (UNLIKELY(!res)) { return res; } @@ -559,7 +566,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(key); + bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); } // atomically update first_seqno_ and earliest_seqno_. @@ -599,6 +606,7 @@ struct Saver { Logger* logger; Statistics* statistics; bool inplace_update_support; + bool do_merge; Env* env_; ReadCallback* callback_; bool* is_blob_index; @@ -625,15 +633,17 @@ static bool SaveValue(void* arg, const char* entry) { // klength varint32 // userkey char[klength-8] // tag uint64 - // vlength varint32 + // vlength varint32f // value char[vlength] // Check that it belongs to same user key. We do not check the // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. 
uint32_t key_length; const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); - if (s->mem->GetInternalKeyComparator().user_comparator()->Equal( - Slice(key_ptr, key_length - 8), s->key->user_key())) { + Slice user_key_slice = Slice(key_ptr, key_length - 8); + if (s->mem->GetInternalKeyComparator() + .user_comparator() + ->CompareWithoutTimestamp(user_key_slice, s->key->user_key()) == 0) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -673,12 +683,24 @@ static bool SaveValue(void* arg, const char* entry) { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->status) = Status::OK(); if (*(s->merge_in_progress)) { - if (s->value != nullptr) { - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), &v, - merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + if (s->do_merge) { + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } + } else { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } + } else if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + merge_context->PushOperand( + v, s->inplace_update_support == false /* operand_pinned */); } else if (s->value != nullptr) { s->value->assign(v.data(), v.size()); } @@ -722,7 +744,8 @@ static bool SaveValue(void* arg, const char* entry) { *(s->merge_in_progress) = true; merge_context->PushOperand( v, s->inplace_update_support == false /* operand_pinned */); - if (merge_operator->ShouldMerge(merge_context->GetOperandsDirectionBackward())) { + if (s->do_merge && merge_operator->ShouldMerge( + merge_context->GetOperandsDirectionBackward())) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, @@ -746,7 +769,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, - ReadCallback* callback, bool* is_blob_index) { + ReadCallback* callback, bool* is_blob_index, bool do_merge) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. 
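The GetLock() change above swaps the `% locks_.size()` mapping for fastrange64(). As a hedged illustration (a minimal sketch of the usual multiply-and-shift reduction, not necessarily the exact RocksDB implementation), mapping a 64-bit hash onto [0, n) without an integer division can look like this:

#include <cstddef>
#include <cstdint>

// Minimal sketch: scale a 64-bit hash into [0, num_buckets) with a
// multiply-and-shift instead of a modulo. Assumes the compiler provides
// unsigned __int128 (gcc/clang do); the real fastrange64 may differ in detail.
inline size_t FastRange64Sketch(uint64_t hash, size_t num_buckets) {
  return static_cast<size_t>(
      (static_cast<unsigned __int128>(hash) * num_buckets) >> 64);
}

// Hypothetical use mirroring the GetLock() change:
//   return &locks_[FastRange64Sketch(GetSliceNPHash64(key), locks_.size())];

The result stays proportional to the hash, so keys remain spread across the lock stripes while the division is avoided.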
@@ -767,11 +790,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = bloom_filter_->MayContain(user_key); + may_contain = + bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); } else { assert(prefix_extractor_); may_contain = @@ -779,6 +804,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); } } + if (bloom_filter_ && !may_contain) { // iter is null if prefix bloom says the key does not exist PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); @@ -787,26 +813,9 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (bloom_filter_) { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } - Saver saver; - saver.status = s; - saver.found_final_value = &found_final_value; - saver.merge_in_progress = &merge_in_progress; - saver.key = &key; - saver.value = value; - saver.seq = kMaxSequenceNumber; - saver.mem = this; - saver.merge_context = merge_context; - saver.max_covering_tombstone_seq = *max_covering_tombstone_seq; - saver.merge_operator = moptions_.merge_operator; - saver.logger = moptions_.info_log; - saver.inplace_update_support = moptions_.inplace_update_support; - saver.statistics = moptions_.statistics; - saver.env_ = env_; - saver.callback_ = callback; - saver.is_blob_index = is_blob_index; - table_->Get(key, &saver, SaveValue); - - *seq = saver.seq; + GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback, + is_blob_index, value, s, merge_context, seq, + &found_final_value, &merge_in_progress); } // No change to value, since we have not yet found a Put/Delete @@ -817,6 +826,103 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, return found_final_value; } +void MemTable::GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, + bool do_merge, ReadCallback* callback, + bool* is_blob_index, std::string* value, Status* s, + MergeContext* merge_context, SequenceNumber* seq, + bool* found_final_value, bool* merge_in_progress) { + Saver saver; + saver.status = s; + saver.found_final_value = found_final_value; + saver.merge_in_progress = merge_in_progress; + saver.key = &key; + saver.value = value; + saver.seq = kMaxSequenceNumber; + saver.mem = this; + saver.merge_context = merge_context; + saver.max_covering_tombstone_seq = max_covering_tombstone_seq; + saver.merge_operator = moptions_.merge_operator; + saver.logger = moptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = moptions_.statistics; + saver.env_ = env_; + saver.callback_ = callback; + saver.is_blob_index = is_blob_index; + saver.do_merge = do_merge; + table_->Get(key, &saver, SaveValue); + *seq = saver.seq; +} + +void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob) { + // The sequence number is updated synchronously in version_set.h + if (IsEmpty()) { + // Avoiding recording stats for speed. 
+ return; + } + PERF_TIMER_GUARD(get_from_memtable_time); + + MultiGetRange temp_range(*range, range->begin(), range->end()); + if (bloom_filter_) { + std::array keys; + std::array may_match = {{true}}; + autovector prefixes; + int num_keys = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (!prefix_extractor_) { + keys[num_keys++] = &iter->ukey; + } else if (prefix_extractor_->InDomain(iter->ukey)) { + prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey)); + keys[num_keys++] = &prefixes.back(); + } + } + bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); + int idx = 0; + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + continue; + } + if (!may_match[idx]) { + temp_range.SkipKey(iter); + PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); + } else { + PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); + } + idx++; + } + } + for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { + SequenceNumber seq = kMaxSequenceNumber; + bool found_final_value{false}; + bool merge_in_progress = iter->s->IsMergeInProgress(); + std::unique_ptr range_del_iter( + NewRangeTombstoneIterator( + read_options, GetInternalKeySeqno(iter->lkey->internal_key()))); + if (range_del_iter != nullptr) { + iter->max_covering_tombstone_seq = std::max( + iter->max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); + } + GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, + callback, is_blob, iter->value->GetSelf(), iter->s, + &(iter->merge_context), &seq, &found_final_value, + &merge_in_progress); + + if (!found_final_value && merge_in_progress) { + *(iter->s) = Status::MergeInProgress(); + } + + if (found_final_value) { + iter->value->PinSelf(); + range->MarkKeyDone(iter); + RecordTick(moptions_.statistics, MEMTABLE_HIT); + } + } + PERF_COUNTER_ADD(get_from_memtable_count, 1); +} + void MemTable::Update(SequenceNumber seq, const Slice& key, const Slice& value) { diff --git a/db/memtable.h b/db/memtable.h index 709e2061e5b..0aeadce80c8 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -19,18 +19,20 @@ #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" +#include "memory/allocator.h" +#include "memory/concurrent_arena.h" #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" -#include "util/allocator.h" -#include "util/concurrent_arena.h" +#include "table/multiget_context.h" #include "util/dynamic_bloom.h" #include "util/hash.h" namespace rocksdb { +struct FlushJobInfo; class Mutex; class MemTableIterator; class MergeContext; @@ -63,6 +65,7 @@ struct MemTablePostProcessInfo { uint64_t num_deletes = 0; }; +using MultiGetRange = MultiGetContext::Range; // Note: Many of the methods in this class have comments indicating that // external synchronization is required as these methods are not thread-safe. 
// It is up to higher layers of code to decide how to prevent concurrent @@ -101,6 +104,9 @@ class MemTable { const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber earliest_seq, uint32_t column_family_id); + // No copying allowed + MemTable(const MemTable&) = delete; + MemTable& operator=(const MemTable&) = delete; // Do not delete this MemTable unless Unref() indicates it not in use. ~MemTable(); @@ -130,6 +136,12 @@ class MemTable { // operations on the same MemTable (unless this Memtable is immutable). size_t ApproximateMemoryUsage(); + // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't + // require external synchronization. The value may be less accurate, though. + size_t ApproximateMemoryUsageFast() { + return approximate_memory_usage_.load(std::memory_order_relaxed); + } + // This method heuristically determines if the memtable should continue to // host more data. bool ShouldScheduleFlush() const { @@ -173,8 +185,13 @@ class MemTable { // the already exists. bool Add(SequenceNumber seq, ValueType type, const Slice& key, const Slice& value, bool allow_concurrent = false, - MemTablePostProcessInfo* post_process_info = nullptr); + MemTablePostProcessInfo* post_process_info = nullptr, + void** hint = nullptr); + // Used to get the value associated with a key, or to get the Merge Operands + // associated with a key. + // If do_merge = true, the default behavior, which is to get the value for + // the key, is executed. Expected behavior is described right below. // If memtable contains a value for key, store it in *value and return true. // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. @@ -188,22 +205,28 @@ class MemTable { // returned). Otherwise, *seq will be set to kMaxSequenceNumber. // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other // status returned indicates a corruption or other unexpected error. + // If do_merge = false, then any Merge Operands encountered for the key are + // simply stored in merge_context.operands_list and never actually merged to + // get a final value. The raw Merge Operands are eventually returned to the + // user.
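To make the do_merge contract above concrete, here is a hedged, self-contained sketch (simplified types and names, not the actual RocksDB call path) of the two code paths: folding operands into a final value versus preserving them so the raw operands can be handed back to the caller.

#include <string>
#include <vector>

// Stand-in for MergeContext: just collects raw operands.
struct MergeContextSketch {
  std::vector<std::string> operands;
};

// do_merge == true  -> fold the operand into *value (placeholder for
//                      MergeHelper::TimedFullMerge()).
// do_merge == false -> keep the operand untouched so it can be returned
//                      to the user as-is.
void HandleMergeOperandSketch(const std::string& operand, bool do_merge,
                              std::string* value, MergeContextSketch* ctx) {
  if (do_merge) {
    value->append(operand);  // simplified "merge" for illustration only
  } else {
    ctx->operands.push_back(operand);
  }
}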
bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr); + bool* is_blob_index = nullptr, bool do_merge = true); bool Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, ReadCallback* callback = nullptr, - bool* is_blob_index = nullptr) { + bool* is_blob_index = nullptr, bool do_merge = true) { SequenceNumber seq; return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, - read_opts, callback, is_blob_index); + read_opts, callback, is_blob_index, do_merge); } + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob); + // Attempts to update the new_value inplace, else does normal Add // Pseudocode // if key exists in current memtable && prev_value is of type kTypeValue @@ -401,6 +424,16 @@ class MemTable { flush_in_progress_ = in_progress; } +#ifndef ROCKSDB_LITE + void SetFlushJobInfo(std::unique_ptr&& info) { + flush_job_info_ = std::move(info); + } + + std::unique_ptr ReleaseFlushJobInfo() { + return std::move(flush_job_info_); + } +#endif // !ROCKSDB_LITE + private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; @@ -479,17 +512,29 @@ class MemTable { // writes with sequence number smaller than seq are flushed. SequenceNumber atomic_flush_seqno_; + // Keeps track of memory usage in table_, arena_, and range_del_table_. + // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`. + std::atomic approximate_memory_usage_; + +#ifndef ROCKSDB_LITE + // Flush job info of the current memtable.
+ std::unique_ptr flush_job_info_; +#endif // !ROCKSDB_LITE + // Returns a heuristic flush decision - bool ShouldFlushNow() const; + bool ShouldFlushNow(); // Updates flush_state_ using ShouldFlushNow() void UpdateFlushState(); void UpdateOldestKeyTime(); - // No copying allowed - MemTable(const MemTable&); - MemTable& operator=(const MemTable&); + void GetFromTable(const LookupKey& key, + SequenceNumber max_covering_tombstone_seq, bool do_merge, + ReadCallback* callback, bool* is_blob_index, + std::string* value, Status* s, MergeContext* merge_context, + SequenceNumber* seq, bool* found_final_value, + bool* merge_in_progress); }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 5abe59b3632..d9159b7937f 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -5,26 +5,22 @@ // #include "db/memtable_list.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/memtable.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" +#include "logging/log_buffer.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "table/merging_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/log_buffer.h" -#include "util/sync_point.h" namespace rocksdb { @@ -50,6 +46,8 @@ MemTableListVersion::MemTableListVersion( size_t* parent_memtable_list_memory_usage, MemTableListVersion* old) : max_write_buffer_number_to_maintain_( old->max_write_buffer_number_to_maintain_), + max_write_buffer_size_to_maintain_( + old->max_write_buffer_size_to_maintain_), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) { if (old != nullptr) { memlist_ = old->memlist_; @@ -66,8 +64,10 @@ MemTableListVersion::MemTableListVersion( MemTableListVersion::MemTableListVersion( size_t* parent_memtable_list_memory_usage, - int max_write_buffer_number_to_maintain) + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain) : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain), parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {} void MemTableListVersion::Ref() { ++refs_; } @@ -113,6 +113,31 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, is_blob_index); } +void MemTableListVersion::MultiGet(const ReadOptions& read_options, + MultiGetRange* range, ReadCallback* callback, + bool* is_blob) { + for (auto memtable : memlist_) { + memtable->MultiGet(read_options, range, callback, is_blob); + if (range->empty()) { + return; + } + } +} + +bool MemTableListVersion::GetMergeOperands( + const LookupKey& key, Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) { + for (MemTable* memtable : memlist_) { + bool done = memtable->Get(key, nullptr, s, merge_context, + max_covering_tombstone_seq, read_opts, nullptr, + nullptr, false); + if (done) { + return true; + } + } + return false; +} + bool MemTableListVersion::GetFromHistory( const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, @@ -230,7 +255,7 @@ void MemTableListVersion::Add(MemTable* m, autovector* to_delete) { assert(refs_ == 1); // 
only when refs_ == 1 is MemTableListVersion mutable AddMemTable(m); - TrimHistory(to_delete); + TrimHistory(to_delete, m->ApproximateMemoryUsage()); } // Removes m from list of memtables not flushed. Caller should NOT Unref m. @@ -240,19 +265,51 @@ void MemTableListVersion::Remove(MemTable* m, memlist_.remove(m); m->MarkFlushed(); - if (max_write_buffer_number_to_maintain_ > 0) { + if (max_write_buffer_size_to_maintain_ > 0 || + max_write_buffer_number_to_maintain_ > 0) { memlist_history_.push_front(m); - TrimHistory(to_delete); + // Unable to get size of mutable memtable at this point, pass 0 to + // TrimHistory as a best effort. + TrimHistory(to_delete, 0); } else { UnrefMemTable(to_delete, m); } } +// return the total memory usage assuming the oldest flushed memtable is dropped +size_t MemTableListVersion::ApproximateMemoryUsageExcludingLast() { + size_t total_memtable_size = 0; + for (auto& memtable : memlist_) { + total_memtable_size += memtable->ApproximateMemoryUsage(); + } + for (auto& memtable : memlist_history_) { + total_memtable_size += memtable->ApproximateMemoryUsage(); + } + if (!memlist_history_.empty()) { + total_memtable_size -= memlist_history_.back()->ApproximateMemoryUsage(); + } + return total_memtable_size; +} + +bool MemTableListVersion::MemtableLimitExceeded(size_t usage) { + if (max_write_buffer_size_to_maintain_ > 0) { + // calculate the total memory usage after dropping the oldest flushed + // memtable, compare with max_write_buffer_size_to_maintain_ to decide + // whether to trim history + return ApproximateMemoryUsageExcludingLast() + usage >= + static_cast(max_write_buffer_size_to_maintain_); + } else if (max_write_buffer_number_to_maintain_ > 0) { + return memlist_.size() + memlist_history_.size() > + static_cast(max_write_buffer_number_to_maintain_); + } else { + return false; + } +} + // Make sure we don't use up too much space in history -void MemTableListVersion::TrimHistory(autovector* to_delete) { - while (memlist_.size() + memlist_history_.size() > - static_cast(max_write_buffer_number_to_maintain_) && - !memlist_history_.empty()) { +void MemTableListVersion::TrimHistory(autovector* to_delete, + size_t usage) { + while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) { MemTable* x = memlist_history_.back(); memlist_history_.pop_back(); @@ -277,8 +334,12 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); const auto& memlist = current_->memlist_; + bool atomic_flush = false; for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; + if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { + atomic_flush = true; + } if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { break; } @@ -292,7 +353,9 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, ret->push_back(m); } } - flush_requested_ = false; // start-flush request is complete + if (!atomic_flush || num_flush_not_started_ == 0) { + flush_requested_ = false; // start-flush request is complete + } } void MemTableList::RollbackMemtableFlush(const autovector& mems, @@ -322,7 +385,8 @@ Status MemTableList::TryInstallMemtableFlushResults( const autovector& mems, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer) { + LogBuffer* log_buffer, + std::list>* committed_flush_jobs_info) { 
AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); @@ -380,6 +444,14 @@ Status MemTableList::TryInstallMemtableFlushResults( cfd->GetName().c_str(), m->file_number_); edit_list.push_back(&m->edit_); memtables_to_flush.push_back(m); +#ifndef ROCKSDB_LITE + std::unique_ptr info = m->ReleaseFlushJobInfo(); + if (info != nullptr) { + committed_flush_jobs_info->push_back(std::move(info)); + } +#else + (void)committed_flush_jobs_info; +#endif // !ROCKSDB_LITE } batch_count++; } @@ -428,10 +500,12 @@ Status MemTableList::TryInstallMemtableFlushResults( cfd->GetName().c_str(), m->file_number_, mem_id); assert(m->file_number_ > 0); current_->Remove(m, to_delete); + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); ++mem_id; } } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; it++) { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { MemTable* m = *it; // commit failed. setup state so that we can flush again. ROCKS_LOG_BUFFER(log_buffer, "Level-0 commit table #%" PRIu64 @@ -467,6 +541,15 @@ void MemTableList::Add(MemTable* m, autovector* to_delete) { if (num_flush_not_started_ == 1) { imm_flush_needed.store(true, std::memory_order_release); } + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); +} + +void MemTableList::TrimHistory(autovector* to_delete, size_t usage) { + InstallNewVersion(); + current_->TrimHistory(to_delete, usage); + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); } // Returns an estimate of the number of bytes of data in use. @@ -480,6 +563,20 @@ size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() { size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; } +size_t MemTableList::ApproximateMemoryUsageExcludingLast() { + size_t usage = + current_memory_usage_excluding_last_.load(std::memory_order_relaxed); + return usage; +} + +// Update current_memory_usage_excluding_last_, need to call whenever state +// changes for MemtableListVersion (whenever InstallNewVersion() is called) +void MemTableList::UpdateMemoryUsageExcludingLast() { + size_t total_memtable_size = current_->ApproximateMemoryUsageExcludingLast(); + current_memory_usage_excluding_last_.store(total_memtable_size, + std::memory_order_relaxed); +} + uint64_t MemTableList::ApproximateOldestKeyTime() const { if (!current_->memlist_.empty()) { return current_->memlist_.back()->ApproximateOldestKeyTime(); @@ -592,7 +689,7 @@ Status InstallMemtableAtomicFlushResults( imm->InstallNewVersion(); } - if (s.ok() || s.IsShutdownInProgress()) { + if (s.ok() || s.IsColumnFamilyDropped()) { for (size_t i = 0; i != cfds.size(); ++i) { if (cfds[i]->IsDropped()) { continue; @@ -607,6 +704,8 @@ Status InstallMemtableAtomicFlushResults( cfds[i]->GetName().c_str(), m->GetFileNumber(), mem_id); imm->current_->Remove(m, to_delete); + imm->UpdateMemoryUsageExcludingLast(); + imm->ResetTrimHistoryNeeded(); } } } else { @@ -632,4 +731,31 @@ Status InstallMemtableAtomicFlushResults( return s; } +void MemTableList::RemoveOldMemTables(uint64_t log_number, + autovector* to_delete) { + assert(to_delete != nullptr); + InstallNewVersion(); + auto& memlist = current_->memlist_; + autovector old_memtables; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* mem = *it; + if (mem->GetNextLogNumber() > log_number) { + break; + } + old_memtables.push_back(mem); + } + + for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) { + MemTable* 
mem = *it; + current_->Remove(mem, to_delete); + --num_flush_not_started_; + if (0 == num_flush_not_started_) { + imm_flush_needed.store(false, std::memory_order_release); + } + } + + UpdateMemoryUsageExcludingLast(); + ResetTrimHistoryNeeded(); +} + } // namespace rocksdb diff --git a/db/memtable_list.h b/db/memtable_list.h index b56ad4932c4..d78a8b5ea9e 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -16,14 +16,14 @@ #include "db/logs_with_prep_tracker.h" #include "db/memtable.h" #include "db/range_del_aggregator.h" +#include "file/filename.h" +#include "logging/log_buffer.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "util/autovector.h" -#include "util/filename.h" -#include "util/log_buffer.h" namespace rocksdb { @@ -33,6 +33,8 @@ class InstrumentedMutex; class MergeIteratorBuilder; class MemTableList; +struct FlushJobInfo; + // keeps a list of immutable memtables in a vector. the list is immutable // if refcount is bigger than one. It is used as a state for Get() and // Iterator code paths @@ -44,7 +46,8 @@ class MemTableListVersion { explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, MemTableListVersion* old = nullptr); explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage, - int max_write_buffer_number_to_maintain); + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain); void Ref(); void Unref(autovector* to_delete = nullptr); @@ -71,6 +74,16 @@ class MemTableListVersion { read_opts, callback, is_blob_index); } + void MultiGet(const ReadOptions& read_options, MultiGetRange* range, + ReadCallback* callback, bool* is_blob); + + // Returns all the merge operands corresponding to the key by searching all + // memtables starting from the most recent one. + bool GetMergeOperands(const LookupKey& key, Status* s, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts); + // Similar to Get(), but searches the Memtable history of memtables that // have already been flushed. Should only be used from in-memory only // queries (such as Transaction validation) as the history may contain @@ -132,7 +145,7 @@ class MemTableListVersion { // REQUIRE: m is an immutable memtable void Remove(MemTable* m, autovector* to_delete); - void TrimHistory(autovector* to_delete); + void TrimHistory(autovector* to_delete, size_t usage); bool GetFromList(std::list* list, const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, @@ -145,6 +158,14 @@ class MemTableListVersion { void UnrefMemTable(autovector* to_delete, MemTable* m); + // Calculate the total amount of memory used by memlist_ and memlist_history_ + // excluding the last MemTable in memlist_history_. The reason for excluding + // the last MemTable is to see if dropping the last MemTable will keep total + // memory usage above or equal to max_write_buffer_size_to_maintain_ + size_t ApproximateMemoryUsageExcludingLast(); + + bool MemtableLimitExceeded(size_t usage); + // Immutable MemTables that have not yet been flushed. std::list memlist_; @@ -153,8 +174,10 @@ class MemTableListVersion { std::list memlist_history_; // Maximum number of MemTables to keep in memory (including both flushed - // and not-yet-flushed tables). const int max_write_buffer_number_to_maintain_; + // Maximum size of MemTables to keep in memory (including both flushed + // and not-yet-flushed tables). 
+ const int64_t max_write_buffer_size_to_maintain_; int refs_ = 0; @@ -169,35 +192,41 @@ class MemTableListVersion { // recoverability from a crash. // // -// Other than imm_flush_needed, this class is not thread-safe and requires -// external synchronization (such as holding the db mutex or being on the -// write thread.) +// Other than imm_flush_needed and imm_trim_needed, this class is not +// thread-safe and requires external synchronization (such as holding the db +// mutex or being on the write thread.) class MemTableList { public: // A list of memtables. explicit MemTableList(int min_write_buffer_number_to_merge, - int max_write_buffer_number_to_maintain) + int max_write_buffer_number_to_maintain, + int64_t max_write_buffer_size_to_maintain) : imm_flush_needed(false), + imm_trim_needed(false), min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), current_(new MemTableListVersion(¤t_memory_usage_, - max_write_buffer_number_to_maintain)), + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain)), num_flush_not_started_(0), commit_in_progress_(false), - flush_requested_(false) { + flush_requested_(false), + current_memory_usage_(0), + current_memory_usage_excluding_last_(0) { current_->Ref(); - current_memory_usage_ = 0; } // Should not delete MemTableList without making sure MemTableList::current() // is Unref()'d. ~MemTableList() {} - MemTableListVersion* current() { return current_; } + MemTableListVersion* current() const { return current_; } // so that background threads can detect non-nullptr pointer to // determine whether there is anything more to start flushing. std::atomic imm_flush_needed; + std::atomic imm_trim_needed; + // Returns the total number of memtables in the list that haven't yet // been flushed and logged. int NumNotFlushed() const; @@ -227,7 +256,8 @@ class MemTableList { const autovector& m, LogsWithPrepTracker* prep_tracker, VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number, autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer); + LogBuffer* log_buffer, + std::list>* committed_flush_jobs_info); // New memtables are inserted at the front of the list. // Takes ownership of the referenced held on *m by the caller of Add(). @@ -236,6 +266,18 @@ class MemTableList { // Returns an estimate of the number of bytes of data in use. size_t ApproximateMemoryUsage(); + // Returns the cached current_memory_usage_excluding_last_ value + size_t ApproximateMemoryUsageExcludingLast(); + + // Update current_memory_usage_excluding_last_ from MemtableListVersion + void UpdateMemoryUsageExcludingLast(); + + // `usage` is the current size of the mutable Memtable. When + // max_write_buffer_size_to_maintain is used, total size of mutable and + // immutable memtables is checked against it to decide whether to trim + // memtable list. + void TrimHistory(autovector* to_delete, size_t usage); + // Returns an estimate of the number of bytes of data used by // the unflushed mem-tables. 
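A hedged restatement of the trimming rule that the new `usage` parameter and max_write_buffer_size_to_maintain feed into (simplified signature, mirroring the MemtableLimitExceeded() logic shown earlier in this diff, not a drop-in replacement for it):

#include <cstddef>
#include <cstdint>

// excluding_last: total usage of the immutable memtables assuming the oldest
//                 flushed one is dropped (ApproximateMemoryUsageExcludingLast).
// usage:          current size of the mutable memtable.
bool ShouldTrimHistorySketch(int64_t max_write_buffer_size_to_maintain,
                             int max_write_buffer_number_to_maintain,
                             size_t excluding_last, size_t usage,
                             size_t num_memtables_total) {
  if (max_write_buffer_size_to_maintain > 0) {
    // Size-based limit: trim while dropping the oldest flushed memtable
    // would still leave us at or above the byte budget.
    return excluding_last + usage >=
           static_cast<size_t>(max_write_buffer_size_to_maintain);
  } else if (max_write_buffer_number_to_maintain > 0) {
    // Legacy count-based limit.
    return num_memtables_total >
           static_cast<size_t>(max_write_buffer_number_to_maintain);
  }
  return false;
}

For example, with a 64 MB write buffer and max_write_buffer_size_to_maintain set to 256 MB, trimming starts once the mutable and flushed memtables together reach 256 MB.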
size_t ApproximateUnflushedMemTablesMemoryUsage(); @@ -252,6 +294,20 @@ class MemTableList { bool HasFlushRequested() { return flush_requested_; } + // Returns true if a trim history should be scheduled and the caller should + // be the one to schedule it + bool MarkTrimHistoryNeeded() { + auto expected = false; + return imm_trim_needed.compare_exchange_strong( + expected, true, std::memory_order_relaxed, std::memory_order_relaxed); + } + + void ResetTrimHistoryNeeded() { + auto expected = true; + imm_trim_needed.compare_exchange_strong( + expected, false, std::memory_order_relaxed, std::memory_order_relaxed); + } + // Copying allowed // MemTableList(const MemTableList&); // void operator=(const MemTableList&); @@ -294,6 +350,13 @@ class MemTableList { } } + // Used only by DBImplSecondary during log replay. + // Remove memtables whose data were written before the WAL with log_number + // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are + // not freed, but put into a vector for future deref and reclamation. + void RemoveOldMemTables(uint64_t log_number, + autovector* to_delete); + private: friend Status InstallMemtableAtomicFlushResults( const autovector* imm_lists, @@ -324,6 +387,8 @@ class MemTableList { // The current memory usage. size_t current_memory_usage_; + + std::atomic current_memory_usage_excluding_last_; }; // Installs memtable atomic flush results. diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index a14c13b893b..32a227f4b55 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -13,9 +13,9 @@ #include "rocksdb/db.h" #include "rocksdb/status.h" #include "rocksdb/write_buffer_manager.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -100,7 +100,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -117,9 +117,11 @@ class MemTableListTest : public testing::Test { // Create dummy mutex. InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); - return list->TryInstallMemtableFlushResults( + std::list> flush_jobs_info; + Status s = list->TryInstallMemtableFlushResults( cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex, - file_num, to_delete, nullptr, &log_buffer); + file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info); + return s; } // Calls MemTableList::InstallMemtableFlushResults() and sets up all @@ -144,7 +146,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, - &write_controller); + &write_controller, /*block_cache_tracer=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -183,7 +185,7 @@ class MemTableListTest : public testing::Test { TEST_F(MemTableListTest, Empty) { // Create an empty MemTableList and validate basic functions. 
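The MarkTrimHistoryNeeded()/ResetTrimHistoryNeeded() pair above follows a common claim-the-work pattern: a compare-and-swap lets exactly one thread win the right to schedule the trim. A minimal, hypothetical sketch of the same pattern (not the RocksDB class itself):

#include <atomic>

class TrimClaimSketch {
 public:
  // Returns true only for the caller that flips the flag from false to true;
  // that caller is the one who should schedule the trim.
  bool MarkNeeded() {
    bool expected = false;
    return needed_.compare_exchange_strong(expected, true,
                                           std::memory_order_relaxed,
                                           std::memory_order_relaxed);
  }
  // Called once the trim has been performed (or is no longer needed).
  void Reset() {
    bool expected = true;
    needed_.compare_exchange_strong(expected, false,
                                    std::memory_order_relaxed,
                                    std::memory_order_relaxed);
  }

 private:
  std::atomic<bool> needed_{false};
};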
- MemTableList list(1, 0); + MemTableList list(1, 0, 0); ASSERT_EQ(0, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -202,8 +204,10 @@ TEST_F(MemTableListTest, GetTest) { // Create MemTableList int min_write_buffer_number_to_merge = 2; int max_write_buffer_number_to_maintain = 0; + int64_t max_write_buffer_size_to_maintain = 0; MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); SequenceNumber seq = 1; std::string value; @@ -312,8 +316,10 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Create MemTableList int min_write_buffer_number_to_merge = 2; int max_write_buffer_number_to_maintain = 2; + int64_t max_write_buffer_size_to_maintain = 2000; MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); SequenceNumber seq = 1; std::string value; @@ -514,8 +520,11 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Create MemTableList int min_write_buffer_number_to_merge = 3; int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); MemTableList list(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); // Create some MemTables uint64_t memtable_id = 0; @@ -670,7 +679,9 @@ TEST_F(MemTableListTest, FlushPendingTest) { // created. So TryInstallMemtableFlushResults will install the first 3 tables // in to_flush and stop when it encounters a table not yet flushed. ASSERT_EQ(2, list.NumNotFlushed()); - int num_in_history = std::min(3, max_write_buffer_number_to_maintain); + int num_in_history = + std::min(3, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); ASSERT_EQ(num_in_history, list.NumFlushed()); ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); @@ -687,7 +698,9 @@ TEST_F(MemTableListTest, FlushPendingTest) { // This will actually install 2 tables. The 1 we told it to flush, and also // tables[4] which has been waiting for tables[3] to commit. 
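As a quick check of the updated arithmetic in the test above (values taken from the test itself):

// max_write_buffer_size_to_maintain = 7 * write_buffer_size
// => max_write_buffer_size_to_maintain / write_buffer_size = 7
// first install:  num_in_history = std::min(3, 7) = 3
// second install: num_in_history = std::min(5, 7) = 5
// i.e. the size-based limit reproduces what max_write_buffer_number_to_maintain
// = 7 allowed before, so the expected counts in the test are unchanged.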
ASSERT_EQ(0, list.NumNotFlushed()); - num_in_history = std::min(5, max_write_buffer_number_to_maintain); + num_in_history = + std::min(5, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); ASSERT_EQ(num_in_history, list.NumFlushed()); ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); @@ -730,7 +743,8 @@ TEST_F(MemTableListTest, FlushPendingTest) { list.current()->Unref(&to_delete); int to_delete_size = - std::min(num_tables, max_write_buffer_number_to_maintain); + std::min(num_tables, static_cast(max_write_buffer_size_to_maintain) / + static_cast(options.write_buffer_size)); ASSERT_EQ(to_delete_size, to_delete.size()); for (const auto& m : to_delete) { @@ -769,10 +783,13 @@ TEST_F(MemTableListTest, AtomicFlusTest) { // Create MemTableLists int min_write_buffer_number_to_merge = 3; int max_write_buffer_number_to_maintain = 7; + int64_t max_write_buffer_size_to_maintain = + 7 * static_cast(options.write_buffer_size); autovector lists; for (int i = 0; i != num_cfs; ++i) { lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain)); + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain)); } autovector cf_ids; diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 4a4d2fb714e..b5ae924ffc6 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -201,7 +201,15 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // want. Also if we're in compaction and it's a put, it would be nice to // run compaction filter on it. const Slice val = iter->value(); - const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr; + const Slice* val_ptr; + if (kTypeValue == ikey.type && + (range_del_agg == nullptr || + !range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal))) { + val_ptr = &val; + } else { + val_ptr = nullptr; + } std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index b61092ee575..3386f9bd067 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -9,9 +9,9 @@ #include "db/merge_helper.h" #include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/db/merge_test.cc b/db/merge_test.cc index 3bd4b9a6004..2965045d9df 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -7,6 +7,9 @@ #include #include +#include "db/db_impl/db_impl.h" +#include "db/dbformat.h" +#include "db/write_batch_internal.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -14,11 +17,8 @@ #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" -#include "db/dbformat.h" -#include "db/db_impl.h" -#include "db/write_batch_internal.h" +#include "test_util/testharness.h" #include "utilities/merge_operators.h" -#include "util/testharness.h" namespace rocksdb { diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 52175a07b74..096a50c1aed 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -13,17 +13,19 @@ #include #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/version_set.h" #include 
"db/write_batch_internal.h" +#include "file/filename.h" +#include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" -#include "util/filename.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" using std::cerr; using std::cout; @@ -32,60 +34,10 @@ using std::flush; namespace rocksdb { -class ObsoleteFilesTest : public testing::Test { +class ObsoleteFilesTest : public DBTestBase { public: - std::string dbname_; - Options options_; - DB* db_; - Env* env_; - int numlevels_; - - ObsoleteFilesTest() { - db_ = nullptr; - env_ = Env::Default(); - // Trigger compaction when the number of level 0 files reaches 2. - options_.level0_file_num_compaction_trigger = 2; - options_.disable_auto_compactions = false; - options_.delete_obsolete_files_period_micros = 0; // always do full purge - options_.enable_thread_tracking = true; - options_.write_buffer_size = 1024*1024*1000; - options_.target_file_size_base = 1024*1024*1000; - options_.max_bytes_for_level_base = 1024*1024*1000; - options_.WAL_ttl_seconds = 300; // Used to test log files - options_.WAL_size_limit_MB = 1024; // Used to test log files - dbname_ = test::PerThreadDBPath("obsolete_files_test"); - options_.wal_dir = dbname_ + "/wal_files"; - - // clean up all the files that might have been there before - std::vector old_files; - env_->GetChildren(dbname_, &old_files); - for (auto file : old_files) { - env_->DeleteFile(dbname_ + "/" + file); - } - env_->GetChildren(options_.wal_dir, &old_files); - for (auto file : old_files) { - env_->DeleteFile(options_.wal_dir + "/" + file); - } - - DestroyDB(dbname_, options_); - numlevels_ = 7; - EXPECT_OK(ReopenDB(true)); - } - - Status ReopenDB(bool create) { - delete db_; - if (create) { - DestroyDB(dbname_, options_); - } - db_ = nullptr; - options_.create_if_missing = create; - return DB::Open(options_, dbname_, &db_); - } - - void CloseDB() { - delete db_; - db_ = nullptr; - } + ObsoleteFilesTest() + : DBTestBase("/obsolete_files_test"), wal_dir_(dbname_ + "/wal_files") {} void AddKeys(int numkeys, int startkey) { WriteOptions options; @@ -98,50 +50,24 @@ class ObsoleteFilesTest : public testing::Test { } } - int numKeysInLevels( - std::vector &metadata, - std::vector *keysperlevel = nullptr) { - - if (keysperlevel != nullptr) { - keysperlevel->resize(numlevels_); - } - - int numKeys = 0; - for (size_t i = 0; i < metadata.size(); i++) { - int startkey = atoi(metadata[i].smallestkey.c_str()); - int endkey = atoi(metadata[i].largestkey.c_str()); - int numkeysinfile = (endkey - startkey + 1); - numKeys += numkeysinfile; - if (keysperlevel != nullptr) { - (*keysperlevel)[(int)metadata[i].level] += numkeysinfile; - } - fprintf(stderr, "level %d name %s smallest %s largest %s\n", - metadata[i].level, metadata[i].name.c_str(), - metadata[i].smallestkey.c_str(), - metadata[i].largestkey.c_str()); - } - return numKeys; - } - void createLevel0Files(int numFiles, int numKeysPerFile) { int startKey = 0; - DBImpl* dbi = reinterpret_cast(db_); for (int i = 0; i < numFiles; i++) { AddKeys(numKeysPerFile, startKey); startKey += numKeysPerFile; - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } - void CheckFileTypeCounts(std::string& dir, - int required_log, 
- int required_sst, - int required_manifest) { + void CheckFileTypeCounts(const std::string& dir, int required_log, + int required_sst, int required_manifest) { std::vector filenames; env_->GetChildren(dir, &filenames); - int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + int log_cnt = 0; + int sst_cnt = 0; + int manifest_cnt = 0; for (auto file : filenames) { uint64_t number; FileType type; @@ -155,9 +81,31 @@ class ObsoleteFilesTest : public testing::Test { ASSERT_EQ(required_sst, sst_cnt); ASSERT_EQ(required_manifest, manifest_cnt); } + + void ReopenDB() { + Options options = CurrentOptions(); + // Trigger compaction when the number of level 0 files reaches 2. + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = false; + options.delete_obsolete_files_period_micros = 0; // always do full purge + options.enable_thread_tracking = true; + options.write_buffer_size = 1024 * 1024 * 1000; + options.target_file_size_base = 1024 * 1024 * 1000; + options.max_bytes_for_level_base = 1024 * 1024 * 1000; + options.WAL_ttl_seconds = 300; // Used to test log files + options.WAL_size_limit_MB = 1024; // Used to test log files + options.wal_dir = wal_dir_; + Destroy(options); + Reopen(options); + } + + const std::string wal_dir_; }; TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) { + ReopenDB(); + SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency({ {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles", "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"}, @@ -171,35 +119,33 @@ TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) { }); SyncPoint::GetInstance()->SetCallBack( "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) { - std::vector* files_grabbed_for_purge_ptr = - reinterpret_cast*>(arg); + std::unordered_set* files_grabbed_for_purge_ptr = + reinterpret_cast*>(arg); ASSERT_TRUE(files_grabbed_for_purge_ptr->empty()); }); SyncPoint::GetInstance()->EnableProcessing(); createLevel0Files(2, 50000); - CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + CheckFileTypeCounts(wal_dir_, 1, 0, 0); - DBImpl* dbi = reinterpret_cast(db_); - port::Thread user_thread([&]() { + port::Thread user_thread([this]() { JobContext jobCxt(0); TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"); - dbi->TEST_LockMutex(); - dbi->FindObsoleteFiles(&jobCxt, - true /* force=true */, false /* no_full_scan=false */); - dbi->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */, + false /* no_full_scan=false */); + dbfull()->TEST_UnlockMutex(); TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"); - dbi->PurgeObsoleteFiles(jobCxt); + dbfull()->PurgeObsoleteFiles(jobCxt); jobCxt.Clean(); }); user_thread.join(); - - CloseDB(); - SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { + ReopenDB(); + SyncPoint::GetInstance()->DisableProcessing(); std::vector optsfiles_nums; std::vector optsfiles_keep; SyncPoint::GetInstance()->SetCallBack( @@ -213,23 +159,22 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { SyncPoint::GetInstance()->EnableProcessing(); createLevel0Files(2, 50000); - CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + CheckFileTypeCounts(wal_dir_, 1, 0, 0); - DBImpl* dbi = static_cast(db_); - ASSERT_OK(dbi->DisableFileDeletions()); + ASSERT_OK(dbfull()->DisableFileDeletions()); for (int i = 0; i != 4; ++i) { if (i % 2) { - 
ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(), - {{"paranoid_file_checks", "false"}})); + ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(), + {{"paranoid_file_checks", "false"}})); } else { - ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(), - {{"paranoid_file_checks", "true"}})); + ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(), + {{"paranoid_file_checks", "true"}})); } } - ASSERT_OK(dbi->EnableFileDeletions(true /* force */)); + ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */)); ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size()); - CloseDB(); + Close(); std::vector files; int opts_file_count = 0; @@ -246,13 +191,22 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { } } ASSERT_EQ(2, opts_file_count); - SyncPoint::GetInstance()->DisableProcessing(); } } //namespace rocksdb +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 0a9a34ff0b5..b86ecefa97a 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -6,11 +6,11 @@ #ifndef ROCKSDB_LITE #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { class OptionsFileTest : public testing::Test { diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index b7efec182a1..94eabff7ff5 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -17,9 +17,9 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" bool FLAGS_random_key = false; diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 2dd0cff0b41..fea1e563cf2 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -12,9 +12,11 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -22,19 +24,17 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/bloom_block.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" +#include "table/plain/plain_table_reader.h" #include "table/table_builder.h" -#include "util/filename.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/hash.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include 
"utilities/merge_operators.h" using std::unique_ptr; @@ -142,6 +142,7 @@ class PlainTableDBTest : public testing::Test, options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.allow_mmap_reads = mmap_mode_; options.allow_concurrent_memtable_write = false; + options.unordered_write = false; return options; } @@ -301,6 +302,7 @@ class TestPlainTableReader : public PlainTableReader { EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); } } + table_properties_.reset(props); } ~TestPlainTableReader() override {} @@ -391,11 +393,72 @@ class TestPlainTableFactory : public PlainTableFactory { const std::string column_family_name_; }; +TEST_P(PlainTableDBTest, BadOptions1) { + // Build with a prefix extractor + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + // Bad attempt to re-open without a prefix extractor + Options options = CurrentOptions(); + options.prefix_extractor.reset(); + Reopen(&options); + ASSERT_EQ( + "Invalid argument: Prefix extractor is missing when opening a PlainTable " + "built using a prefix extractor", + Get("1000000000000foo")); + + // Bad attempt to re-open with different prefix extractor + options.prefix_extractor.reset(NewFixedPrefixTransform(6)); + Reopen(&options); + ASSERT_EQ( + "Invalid argument: Prefix extractor given doesn't match the one used to " + "build PlainTable", + Get("1000000000000foo")); + + // Correct prefix extractor + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); +} + +TEST_P(PlainTableDBTest, BadOptions2) { + Options options = CurrentOptions(); + options.prefix_extractor.reset(); + options.create_if_missing = true; + DestroyAndReopen(&options); + // Build without a prefix extractor + // (apparently works even if hash_table_ratio > 0) + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor + Status s = TryReopen(&options); + ASSERT_EQ( + "Not implemented: PlainTable requires a prefix extractor enable prefix " + "hash mode.", + s.ToString()); + + // OK to open with hash_table_ratio == 0 and no prefix extractor + PlainTableOptions plain_table_options; + plain_table_options.hash_table_ratio = 0; + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); + + // OK to open newly with a prefix_extractor and hash table; builds index + // in memory. 
+ options = CurrentOptions(); + Reopen(&options); + ASSERT_EQ("v1", Get("1000000000000foo")); +} + TEST_P(PlainTableDBTest, Flush) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; huge_page_tlb_size += 2 * 1024 * 1024) { for (EncodingType encoding_type : {kPlain, kPrefix}) { - for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int bloom = -1; bloom <= 117; bloom += 117) { + const int bloom_bits = std::max(bloom, 0); + const bool full_scan_mode = bloom < 0; for (int total_order = 0; total_order <= 1; total_order++) { for (int store_index_in_file = 0; store_index_in_file <= 1; ++store_index_in_file) { @@ -413,7 +476,7 @@ TEST_P(PlainTableDBTest, Flush) { plain_table_options.index_sparseness = 2; plain_table_options.huge_page_tlb_size = huge_page_tlb_size; plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = false; + plain_table_options.full_scan_mode = full_scan_mode; plain_table_options.store_index_in_file = store_index_in_file; options.table_factory.reset( @@ -426,7 +489,7 @@ TEST_P(PlainTableDBTest, Flush) { plain_table_options.index_sparseness = 16; plain_table_options.huge_page_tlb_size = huge_page_tlb_size; plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = false; + plain_table_options.full_scan_mode = full_scan_mode; plain_table_options.store_index_in_file = store_index_in_file; options.table_factory.reset( @@ -453,20 +516,36 @@ TEST_P(PlainTableDBTest, Flush) { auto row = ptc.begin(); auto tp = row->second; - if (!store_index_in_file) { - ASSERT_EQ(total_order ? "4" : "12", - (tp->user_collected_properties) - .at("plain_table_hash_table_size")); - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_sub_index_size")); + if (full_scan_mode) { + // Does not support Get/Seek + std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("0000000000000bar", iter->key().ToString()); + ASSERT_EQ("v2", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000000foo", iter->key().ToString()); + ASSERT_EQ("v3", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().ok()); } else { - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_hash_table_size")); - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_sub_index_size")); + if (!store_index_in_file) { + ASSERT_EQ(total_order ? 
"4" : "12", + (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } else { + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); } - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); } } } @@ -729,6 +808,64 @@ TEST_P(PlainTableDBTest, Iterator) { } } +namespace { +std::string NthKey(size_t n, char filler) { + std::string rv(16, filler); + rv[0] = n % 10; + rv[1] = (n / 10) % 10; + rv[2] = (n / 100) % 10; + rv[3] = (n / 1000) % 10; + return rv; +} +} // anonymous namespace + +TEST_P(PlainTableDBTest, BloomSchema) { + Options options = CurrentOptions(); + options.create_if_missing = true; + for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) { + options.bloom_locality = bloom_locality; + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = 3; // high FP rate for test + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + + bool expect_bloom_not_match = false; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */, + kDefaultColumnFamilyName)); + DestroyAndReopen(&options); + + for (unsigned i = 0; i < 2345; ++i) { + ASSERT_OK(Put(NthKey(i, 'y'), "added")); + } + dbfull()->TEST_FlushMemTable(); + ASSERT_EQ("added", Get(NthKey(42, 'y'))); + + for (unsigned i = 0; i < 32; ++i) { + // Known pattern of Bloom filter false positives can detect schema change + // with high probability. Known FPs stuffed into bits: + uint32_t pattern; + if (!bloom_locality) { + pattern = 1785868347UL; + } else if (CACHE_LINE_SIZE == 64U) { + pattern = 2421694657UL; + } else if (CACHE_LINE_SIZE == 128U) { + pattern = 788710956UL; + } else { + ASSERT_EQ(CACHE_LINE_SIZE, 256U); + pattern = 163905UL; + } + bool expect_fp = pattern & (1UL << i); + // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp); + expect_bloom_not_match = !expect_fp; + ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n'))); + } + } +} + namespace { std::string MakeLongKey(size_t length, char c) { return std::string(length, c); diff --git a/db/pre_release_callback.h b/db/pre_release_callback.h index f91ef1b27ac..e4167904ff8 100644 --- a/db/pre_release_callback.h +++ b/db/pre_release_callback.h @@ -27,8 +27,12 @@ class PreReleaseCallback { // is_mem_disabled is currently used for debugging purposes to assert that // the callback is done from the right write queue. // If non-zero, log_number indicates the WAL log to which we wrote. + // index >= 0 specifies the order of callback in the same write thread. + // total > index specifies the total number of callbacks in the same write + // thread. Together with index, could be used to reduce the redundant + // operations among the callbacks. 
virtual Status Callback(SequenceNumber seq, bool is_mem_disabled, - uint64_t log_number) = 0; + uint64_t log_number, size_t index, size_t total) = 0; }; } // namespace rocksdb diff --git a/db/prefix_test.cc b/db/prefix_test.cc index ac854cb3dbd..19f02f1099a 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -17,7 +17,7 @@ int main() { #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "monitoring/histogram.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -26,12 +26,12 @@ int main() { #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "test_util/testharness.h" #include "util/coding.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/testharness.h" #include "utilities/merge_operators.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; @@ -751,7 +751,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) { for (size_t k = 0; k < 9; k++) { if (rnd.OneIn(2) || it == whole_map.begin()) { iter->Next(); - it++; + ++it; if (FLAGS_enable_print) { std::cout << "Next >> "; } diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 8f86528ecb2..7c188aeaa07 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -5,7 +5,7 @@ #include "db/range_del_aggregator.h" -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index e593807d548..96cfb581309 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -13,7 +13,7 @@ #include #include -#include "db/compaction_iteration_stats.h" +#include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" @@ -320,8 +320,10 @@ class RangeDelAggregator { RangeDelPositioningMode mode); void Invalidate() { - InvalidateForwardIter(); - InvalidateReverseIter(); + if (!IsEmpty()) { + InvalidateForwardIter(); + InvalidateReverseIter(); + } } bool IsRangeOverlapped(const Slice& start, const Slice& end); diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 34b2f7e5db1..97ba6ca4f8a 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -23,10 +23,10 @@ int main() { #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/testutil.h" #include "util/gflags_compat.h" diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 28c8129ecb0..7ce666326a8 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -12,7 +12,7 @@ #include "db/db_test_util.h" #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index e3eb18908a5..fe64623fee5 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -9,8 +9,8 @@ #include #include -#include #include +#include #include "util/autovector.h" #include "util/kv_map.h" diff --git a/db/range_tombstone_fragmenter.h 
b/db/range_tombstone_fragmenter.h index a0b77b67771..23a28396fd4 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -144,6 +144,8 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { void Invalidate() { pos_ = tombstones_->end(); seq_pos_ = tombstones_->seq_end(); + pinned_pos_ = tombstones_->end(); + pinned_seq_pos_ = tombstones_->seq_end(); } RangeTombstone Tombstone() const { diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc index ddd3f774176..11f3574967d 100644 --- a/db/range_tombstone_fragmenter_test.cc +++ b/db/range_tombstone_fragmenter_test.cc @@ -7,7 +7,7 @@ #include "db/db_test_util.h" #include "rocksdb/comparator.h" -#include "util/testutil.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/db/read_callback.h b/db/read_callback.h index 60f91ef872d..d8801e65173 100644 --- a/db/read_callback.h +++ b/db/read_callback.h @@ -42,9 +42,6 @@ class ReadCallback { // Refresh to a more recent visible seq virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; } - // Refer to DBIter::CanReseekToSkip - virtual bool CanReseekToSkip() { return true; } - protected: // The max visible seq, it is usually the snapshot but could be larger if // transaction has its own writes written to db. diff --git a/db/repair.cc b/db/repair.cc index 2715adcf129..b71f725a285 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -34,6 +34,7 @@ // We scan every table to compute // (1) smallest/largest for the table // (2) largest sequence number in the table +// (3) oldest blob file referred to by the table (if applicable) // // If we are unable to scan the file, then we ignore the table. // @@ -60,13 +61,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include "db/builder.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -74,6 +71,8 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" +#include "file/filename.h" +#include "file/writable_file_writer.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -81,8 +80,6 @@ #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/string_util.h" namespace rocksdb { @@ -113,11 +110,13 @@ class Repairer { // once. 
NewLRUCache(10, db_options_.table_cache_numshardbits)), table_cache_(new TableCache(default_cf_iopts_, env_options_, - raw_table_cache_.get())), + raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr)), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), vset_(dbname_, &immutable_db_options_, env_options_, - raw_table_cache_.get(), &wb_, &wc_), + raw_table_cache_.get(), &wb_, &wc_, + /*block_cache_tracer=*/nullptr), next_file_number_(1), db_lock_(nullptr) { for (const auto& cfd : column_families) { @@ -226,8 +225,6 @@ class Repairer { FileMetaData meta; uint32_t column_family_id; std::string column_family_name; - SequenceNumber min_sequence; - SequenceNumber max_sequence; }; std::string const dbname_; @@ -385,7 +382,8 @@ class Repairer { continue; } WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr); + status = + WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { @@ -501,6 +499,7 @@ class Repairer { status = AddColumnFamily(props->column_family_name, t->column_family_id); } + t->meta.oldest_ancester_time = props->creation_time; } ColumnFamilyData* cfd = nullptr; if (status.ok()) { @@ -522,11 +521,12 @@ class Repairer { InternalIterator* iter = table_cache_->NewIterator( ropts, env_options_, cfd->internal_comparator(), t->meta, nullptr /* range_del_agg */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - bool empty = true; + cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), + /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, + TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, + /*level=*/-1, /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); ParsedInternalKey parsed; - t->min_sequence = 0; - t->max_sequence = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); if (!ParseInternalKey(key, &parsed)) { @@ -537,18 +537,9 @@ class Repairer { } counter++; - if (empty) { - empty = false; - t->meta.smallest.DecodeFrom(key); - t->min_sequence = parsed.sequence; - } - t->meta.largest.DecodeFrom(key); - if (parsed.sequence < t->min_sequence) { - t->min_sequence = parsed.sequence; - } - if (parsed.sequence > t->max_sequence) { - t->max_sequence = parsed.sequence; - } + + t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence, + parsed.type); } if (!iter->status().ok()) { status = iter->status(); @@ -567,8 +558,8 @@ class Repairer { SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]); - if (max_sequence < tables_[i].max_sequence) { - max_sequence = tables_[i].max_sequence; + if (max_sequence < tables_[i].meta.fd.largest_seqno) { + max_sequence = tables_[i].meta.fd.largest_seqno; } } vset_.SetLastAllocatedSequence(max_sequence); @@ -586,10 +577,13 @@ class Repairer { // TODO(opt): separate out into multiple levels for (const auto* table : cf_id_and_tables.second) { - edit.AddFile(0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), - table->meta.fd.GetFileSize(), table->meta.smallest, - table->meta.largest, table->min_sequence, - table->max_sequence, table->meta.marked_for_compaction); + edit.AddFile( + 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), + table->meta.fd.GetFileSize(), table->meta.smallest, + table->meta.largest, table->meta.fd.smallest_seqno, + table->meta.fd.largest_seqno, 
table->meta.marked_for_compaction, + table->meta.oldest_blob_file_number, + table->meta.oldest_ancester_time, table->meta.file_creation_time); } assert(next_file_number_ > 0); vset_.MarkFileNumberUsed(next_file_number_ - 1); diff --git a/db/repair_test.cc b/db/repair_test.cc index 3422532da4b..21907e43575 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -9,12 +9,12 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "file/file_util.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/transaction_log.h" -#include "util/file_util.h" #include "util/string_util.h" namespace rocksdb { diff --git a/db/table_cache.cc b/db/table_cache.cc index 06255d6a354..89acd3d84e6 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -11,21 +11,23 @@ #include "db/dbformat.h" #include "db/range_tombstone_fragmenter.h" +#include "db/snapshot_impl.h" #include "db/version_edit.h" -#include "util/filename.h" - +#include "file/filename.h" +#include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/statistics.h" +#include "table/block_based/block_based_table_reader.h" #include "table/get_context.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "table/multiget_context.h" #include "table/table_builder.h" #include "table/table_reader.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" #include "util/coding.h" -#include "util/file_reader_writer.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { @@ -43,13 +45,6 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } -static void DeleteTableReader(void* arg1, void* arg2) { - TableReader* table_reader = reinterpret_cast(arg1); - Statistics* stats = reinterpret_cast(arg2); - RecordTick(stats, NO_FILE_CLOSES); - delete table_reader; -} - static Slice GetSliceForFileNumber(const uint64_t* file_number) { return Slice(reinterpret_cast(file_number), sizeof(*file_number)); @@ -68,11 +63,13 @@ void AppendVarint64(IterKey* key, uint64_t v) { } // namespace TableCache::TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, Cache* const cache) + const EnvOptions& env_options, Cache* const cache, + BlockCacheTracer* const block_cache_tracer) : ioptions_(ioptions), env_options_(env_options), cache_(cache), - immortal_tables_(false) { + immortal_tables_(false), + block_cache_tracer_(block_cache_tracer) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. 
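The row cache consulted above is owned by the application rather than by TableCache; a minimal sketch (application-level, not part of this patch; the cache size is arbitrary) of wiring it up through the public options:

#include "rocksdb/cache.h"
#include "rocksdb/options.h"

// The shared_ptr stored here is what later shows up as ioptions_.row_cache;
// because the same cache object may be shared by several DB instances, the
// constructor above prepends a per-instance row_cache_id_ to every key.
rocksdb::Options OptionsWithRowCache() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.row_cache = rocksdb::NewLRUCache(64 << 20);  // 64 MiB capacity
  return options;
}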
@@ -94,10 +91,10 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { Status TableCache::GetTableReader( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - bool sequential_mode, size_t readahead, bool record_read_stats, - HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, const SliceTransform* prefix_extractor, bool skip_filters, int level, - bool prefetch_index_and_filter_in_cache, bool for_compaction) { + bool prefetch_index_and_filter_in_cache) { std::string fname = TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; @@ -105,13 +102,6 @@ Status TableCache::GetTableReader( RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (readahead > 0 && !env_options.use_mmap_reads) { - // Not compatible with mmap files since ReadaheadRandomAccessFile requires - // its wrapped file's Read() to copy data into the provided scratch - // buffer, which mmap files don't use. - // TODO(ajkr): try madvise for mmap files in place of buffered readahead. - file = NewReadaheadRandomAccessFile(std::move(file), readahead); - } if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } @@ -120,12 +110,11 @@ Status TableCache::GetTableReader( new RandomAccessFileReader( std::move(file), fname, ioptions_.env, record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, for_compaction, - ioptions_.listeners)); + file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, prefix_extractor, env_options, internal_comparator, skip_filters, immortal_tables_, - level, fd.largest_seqno), + level, fd.largest_seqno, block_cache_tracer_), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -162,10 +151,9 @@ Status TableCache::FindTable(const EnvOptions& env_options, } std::unique_ptr table_reader; s = GetTableReader(env_options, internal_comparator, fd, - false /* sequential mode */, 0 /* readahead */, - record_read_stats, file_read_hist, &table_reader, - prefix_extractor, skip_filters, level, - prefetch_index_and_filter_in_cache); + false /* sequential mode */, record_read_stats, + file_read_hist, &table_reader, prefix_extractor, + skip_filters, level, prefetch_index_and_filter_in_cache); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.statistics, NO_FILE_ERRORS); @@ -188,54 +176,27 @@ InternalIterator* TableCache::NewIterator( const InternalKeyComparator& icomparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena, bool skip_filters, int level, + TableReaderCaller caller, Arena* arena, bool skip_filters, int level, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; - bool create_new_table_reader = false; TableReader* table_reader = nullptr; Cache::Handle* handle = nullptr; if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - size_t readahead = 0; - if (for_compaction) { -#ifndef NDEBUG - bool use_direct_reads_for_compaction = 
env_options.use_direct_reads; - TEST_SYNC_POINT_CALLBACK("TableCache::NewIterator:for_compaction", - &use_direct_reads_for_compaction); -#endif // !NDEBUG - if (ioptions_.new_table_reader_for_compaction_inputs) { - // get compaction_readahead_size from env_options allows us to set the - // value dynamically - readahead = env_options.compaction_readahead_size; - create_new_table_reader = true; - } - } - + bool for_compaction = caller == TableReaderCaller::kCompaction; auto& fd = file_meta.fd; - if (create_new_table_reader) { - std::unique_ptr table_reader_unique_ptr; - s = GetTableReader( - env_options, icomparator, fd, true /* sequential_mode */, readahead, - !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr, - prefix_extractor, false /* skip_filters */, level, - true /* prefetch_index_and_filter_in_cache */, for_compaction); + table_reader = fd.table_reader; + if (table_reader == nullptr) { + s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record_read_stats */, file_read_hist, + skip_filters, level); if (s.ok()) { - table_reader = table_reader_unique_ptr.release(); - } - } else { - table_reader = fd.table_reader; - if (table_reader == nullptr) { - s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, - options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist, - skip_filters, level); - if (s.ok()) { - table_reader = GetTableReaderFromHandle(handle); - } + table_reader = GetTableReaderFromHandle(handle); } } InternalIterator* result = nullptr; @@ -245,13 +206,10 @@ InternalIterator* TableCache::NewIterator( result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator(options, prefix_extractor, arena, - skip_filters, for_compaction); + skip_filters, caller, + env_options.compaction_readahead_size); } - if (create_new_table_reader) { - assert(handle == nullptr); - result->RegisterCleanup(&DeleteTableReader, table_reader, - ioptions_.statistics); - } else if (handle != nullptr) { + if (handle != nullptr) { result->RegisterCleanup(&UnrefEntry, cache_, handle); handle = nullptr; // prevent from releasing below } @@ -296,6 +254,101 @@ InternalIterator* TableCache::NewIterator( return result; } +Status TableCache::GetRangeTombstoneIterator( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::unique_ptr* out_iter) { + const FileDescriptor& fd = file_meta.fd; + Status s; + TableReader* t = fd.table_reader; + Cache::Handle* handle = nullptr; + if (t == nullptr) { + s = FindTable(env_options_, internal_comparator, fd, &handle); + if (s.ok()) { + t = GetTableReaderFromHandle(handle); + } + } + if (s.ok()) { + out_iter->reset(t->NewRangeTombstoneIterator(options)); + assert(out_iter); + } + return s; +} + +#ifndef ROCKSDB_LITE +void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, + IterKey& row_cache_key) { + uint64_t fd_number = fd.GetNumber(); + // We use the user key as cache key instead of the internal key, + // otherwise the whole cache would be invalidated every time the + // sequence key increases. However, to support caching snapshot + // reads, we append the sequence number (incremented by 1 to + // distinguish from 0) only in this case. 
+ // If the snapshot is larger than the largest seqno in the file, + // all data should be exposed to the snapshot, so we treat it + // the same as there is no snapshot. The exception is that if + // a seq-checking callback is registered, some internal keys + // may still be filtered out. + uint64_t seq_no = 0; + // Maybe we can include the whole file if snapshot == fd.largest_seqno. + if (options.snapshot != nullptr && + (get_context->has_callback() || + static_cast_with_check( + options.snapshot) + ->GetSequenceNumber() <= fd.largest_seqno)) { + // We should consider using options.snapshot->GetSequenceNumber() + // instead of GetInternalKeySeqno(k), which will make the code + // easier to understand. + seq_no = 1 + GetInternalKeySeqno(internal_key); + } + + // Compute row cache key. + row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), + row_cache_id_.size()); + AppendVarint64(&row_cache_key, fd_number); + AppendVarint64(&row_cache_key, seq_no); +} + +bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, + size_t prefix_size, GetContext* get_context) { + bool found = false; + + row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); + if (auto row_handle = + ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) { + // Cleanable routine to release the cache entry + Cleanable value_pinner; + auto release_cache_entry_func = [](void* cache_to_clean, + void* cache_handle) { + ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle); + }; + auto found_row_cache_entry = + static_cast(ioptions_.row_cache->Value(row_handle)); + // If it comes here value is located on the cache. + // found_row_cache_entry points to the value on cache, + // and value_pinner has cleanup procedure for the cached entry. + // After replayGetContextLog() returns, get_context.pinnable_slice_ + // will point to cache entry buffer (or a copy based on that) and + // cleanup routine under value_pinner will be delegated to + // get_context.pinnable_slice_. Cache entry is released when + // get_context.pinnable_slice_ is reset. + value_pinner.RegisterCleanup(release_cache_entry_func, + ioptions_.row_cache.get(), row_handle); + replayGetContextLog(*found_row_cache_entry, user_key, get_context, + &value_pinner); + RecordTick(ioptions_.statistics, ROW_CACHE_HIT); + found = true; + } else { + RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + } + return found; +} +#endif // ROCKSDB_LITE + + Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, @@ -313,51 +366,11 @@ Status TableCache::Get(const ReadOptions& options, // Check row cache if enabled. Since row cache does not currently store // sequence numbers, we cannot use it if we need to fetch the sequence. if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { - uint64_t fd_number = fd.GetNumber(); auto user_key = ExtractUserKey(k); - // We use the user key as cache key instead of the internal key, - // otherwise the whole cache would be invalidated every time the - // sequence key increases. However, to support caching snapshot - // reads, we append the sequence number (incremented by 1 to - // distinguish from 0) only in this case. - uint64_t seq_no = - options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k); - - // Compute row cache key. 
- row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), - row_cache_id_.size()); - AppendVarint64(&row_cache_key, fd_number); - AppendVarint64(&row_cache_key, seq_no); - row_cache_key.TrimAppend(row_cache_key.Size(), user_key.data(), - user_key.size()); - - if (auto row_handle = - ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) { - // Cleanable routine to release the cache entry - Cleanable value_pinner; - auto release_cache_entry_func = [](void* cache_to_clean, - void* cache_handle) { - ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle); - }; - auto found_row_cache_entry = static_cast( - ioptions_.row_cache->Value(row_handle)); - // If it comes here value is located on the cache. - // found_row_cache_entry points to the value on cache, - // and value_pinner has cleanup procedure for the cached entry. - // After replayGetContextLog() returns, get_context.pinnable_slice_ - // will point to cache entry buffer (or a copy based on that) and - // cleanup routine under value_pinner will be delegated to - // get_context.pinnable_slice_. Cache entry is released when - // get_context.pinnable_slice_ is reset. - value_pinner.RegisterCleanup(release_cache_entry_func, - ioptions_.row_cache.get(), row_handle); - replayGetContextLog(*found_row_cache_entry, user_key, get_context, - &value_pinner); - RecordTick(ioptions_.statistics, ROW_CACHE_HIT); - done = true; - } else { - // Not found, setting up the replay log. - RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); + done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), + get_context); + if (!done) { row_cache_entry = &row_cache_entry_buffer; } } @@ -417,8 +430,6 @@ Status TableCache::Get(const ReadOptions& options, } // Batched version of TableCache::MultiGet. -// TODO: Add support for row cache. As of now, this ignores the row cache -// and directly looks up in the table files Status TableCache::MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, @@ -430,7 +441,44 @@ Status TableCache::MultiGet(const ReadOptions& options, Status s; TableReader* t = fd.table_reader; Cache::Handle* handle = nullptr; - if (s.ok()) { + MultiGetRange table_range(*mget_range, mget_range->begin(), + mget_range->end()); +#ifndef ROCKSDB_LITE + autovector row_cache_entries; + IterKey row_cache_key; + size_t row_cache_key_prefix_size = 0; + KeyContext& first_key = *table_range.begin(); + bool lookup_row_cache = + ioptions_.row_cache && !first_key.get_context->NeedToReadSequence(); + + // Check row cache if enabled. Since row cache does not currently store + // sequence numbers, we cannot use it if we need to fetch the sequence. + if (lookup_row_cache) { + GetContext* first_context = first_key.get_context; + CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context, + row_cache_key); + row_cache_key_prefix_size = row_cache_key.Size(); + + for (auto miter = table_range.begin(); miter != table_range.end(); + ++miter) { + const Slice& user_key = miter->ukey; + ; + GetContext* get_context = miter->get_context; + + if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, + get_context)) { + table_range.SkipKey(miter); + } else { + row_cache_entries.emplace_back(); + get_context->SetReplayLog(&(row_cache_entries.back())); + } + } + } +#endif // ROCKSDB_LITE + + // Check that table_range is not empty. 
Its possible all keys may have been + // found in the row cache and thus the range may now be empty + if (s.ok() && !table_range.empty()) { if (t == nullptr) { s = FindTable( env_options_, internal_comparator, fd, &handle, prefix_extractor, @@ -445,21 +493,20 @@ Status TableCache::MultiGet(const ReadOptions& options, std::unique_ptr range_del_iter( t->NewRangeTombstoneIterator(options)); if (range_del_iter != nullptr) { - for (auto iter = mget_range->begin(); iter != mget_range->end(); + for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { - const Slice& k = iter->ikey; SequenceNumber* max_covering_tombstone_seq = iter->get_context->max_covering_tombstone_seq(); - *max_covering_tombstone_seq = std::max( - *max_covering_tombstone_seq, - range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k))); + *max_covering_tombstone_seq = + std::max(*max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey)); } } } if (s.ok()) { - t->MultiGet(options, mget_range, prefix_extractor, skip_filters); + t->MultiGet(options, &table_range, prefix_extractor, skip_filters); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { - for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) { + for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { Status* status = iter->s; if (status->IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set @@ -470,6 +517,33 @@ Status TableCache::MultiGet(const ReadOptions& options, } } +#ifndef ROCKSDB_LITE + if (lookup_row_cache) { + size_t row_idx = 0; + + for (auto miter = table_range.begin(); miter != table_range.end(); + ++miter) { + std::string& row_cache_entry = row_cache_entries[row_idx++]; + const Slice& user_key = miter->ukey; + ; + GetContext* get_context = miter->get_context; + + get_context->SetReplayLog(nullptr); + // Compute row cache key. + row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(), + user_key.size()); + // Put the replay log in row cache only if something was found. 
+ if (s.ok() && !row_cache_entry.empty()) { + size_t charge = + row_cache_key.Size() + row_cache_entry.size() + sizeof(std::string); + void* row_ptr = new std::string(std::move(row_cache_entry)); + ioptions_.row_cache->Insert(row_cache_key.GetUserKey(), row_ptr, charge, + &DeleteEntry); + } + } + } +#endif // ROCKSDB_LITE + if (handle != nullptr) { ReleaseHandle(handle); } @@ -531,4 +605,57 @@ void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } +uint64_t TableCache::ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = fd.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable(env_options_, internal_comparator, fd, &table_handle, + prefix_extractor, false /* no_io */, + !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = GetTableReaderFromHandle(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateOffsetOf(key, caller); + } + if (table_handle != nullptr) { + ReleaseHandle(table_handle); + } + + return result; +} + +uint64_t TableCache::ApproximateSize( + const Slice& start, const Slice& end, const FileDescriptor& fd, + TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor) { + uint64_t result = 0; + TableReader* table_reader = fd.table_reader; + Cache::Handle* table_handle = nullptr; + if (table_reader == nullptr) { + const bool for_compaction = (caller == TableReaderCaller::kCompaction); + Status s = FindTable(env_options_, internal_comparator, fd, &table_handle, + prefix_extractor, false /* no_io */, + !for_compaction /* record_read_stats */); + if (s.ok()) { + table_reader = GetTableReaderFromHandle(table_handle); + } + } + + if (table_reader != nullptr) { + result = table_reader->ApproximateSize(start, end, caller); + } + if (table_handle != nullptr) { + ReleaseHandle(table_handle); + } + + return result; +} } // namespace rocksdb diff --git a/db/table_cache.h b/db/table_cache.h index 1e96dfa1bd5..088040672d8 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -23,6 +23,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/table_reader.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -32,40 +33,56 @@ struct FileDescriptor; class GetContext; class HistogramImpl; +// Manages caching for TableReader objects for a column family. The actual +// cache is allocated separately and passed to the constructor. TableCache +// wraps around the underlying SST file readers by providing Get(), +// MultiGet() and NewIterator() methods that hide the instantiation, +// caching and access to the TableReader. The main purpose of this is +// performance - by caching the TableReader, it avoids unnecessary file opens +// and object allocation and instantiation. One exception is compaction, where +// a new TableReader may be instantiated - see NewIterator() comments +// +// Another service provided by TableCache is managing the row cache - if the +// DB is configured with a row cache, and the lookup key is present in the row +// cache, lookup is very fast. 
The row cache is obtained from +// ioptions.row_cache class TableCache { public: TableCache(const ImmutableCFOptions& ioptions, - const EnvOptions& storage_options, Cache* cache); + const EnvOptions& storage_options, Cache* cache, + BlockCacheTracer* const block_cache_tracer); ~TableCache(); // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-nullptr, also sets "*tableptr" to point to the Table object + // file length must be exactly "file_size" bytes). If "table_reader_ptr" + // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object // underlying the returned iterator, or nullptr if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the + // the returned iterator. The returned "*table_reader_ptr" object is owned + // by the cache and should not be deleted, and is valid for as long as the // returned iterator is live. // @param range_del_agg If non-nullptr, adds range deletions to the // aggregator. If an error occurs, returns it in a NewErrorInternalIterator + // @param for_compaction If true, a new TableReader may be allocated (but + // not cached), depending on the CF options // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" InternalIterator* NewIterator( const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const SliceTransform* prefix_extractor = nullptr, - TableReader** table_reader_ptr = nullptr, - HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false, int level = -1, - const InternalKey* smallest_compaction_key = nullptr, - const InternalKey* largest_compaction_key = nullptr); + const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, + HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, + bool skip_filters, int level, const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key); // If a seek to internal key "k" in specified file finds an entry, - // call (*handle_result)(arg, found_key, found_value) repeatedly until - // it returns false. - // @param get_context State for get operation. If its range_del_agg() returns - // non-nullptr, adds range deletions to the aggregator. If an error occurs, - // returns non-ok status. + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param get_context Context for get operation. The result of the lookup + // can be retrieved by calling get_context->State() + // @param file_read_hist If non-nullptr, the file reader statistics are + // recorded // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" Status Get(const ReadOptions& options, @@ -76,6 +93,23 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1); + // Return the range delete tombstone iterator of the file specified by + // `file_meta`. 
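The tombstones returned by the iterator declared below originate from range deletions issued by the application through DB::DeleteRange(); a minimal sketch of that producing side (not part of this patch, `db` is assumed to be an open handle):

#include "rocksdb/db.h"

// Deletes every key in ["start", "end"). After the memtable is flushed, the
// resulting range tombstone is what GetRangeTombstoneIterator() reads back
// from the SST file's range-deletion block.
rocksdb::Status DeleteKeyRange(rocksdb::DB* db) {
  return db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                         "start", "end");
}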
+ Status GetRangeTombstoneIterator( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::unique_ptr* out_iter); + + // If a seek to internal key "k" in specified file finds an entry, + // call get_context->SaveValue() repeatedly until + // it returns false. As a side effect, it will insert the TableReader + // into the cache and potentially evict another entry + // @param mget_range Pointer to the structure describing a batch of keys to + // be looked up in this table file. The result is stored + // in the embedded GetContext + // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" Status MultiGet(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, @@ -127,6 +161,19 @@ class TableCache { const FileDescriptor& fd, const SliceTransform* prefix_extractor = nullptr); + // Returns approximated offset of a key in a file represented by fd. + uint64_t ApproximateOffsetOf( + const Slice& key, const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor = nullptr); + + // Returns approximated data size between start and end keys in a file + // represented by fd (the start key must not be greater than the end key). + uint64_t ApproximateSize(const Slice& start, const Slice& end, + const FileDescriptor& fd, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const SliceTransform* prefix_extractor = nullptr); + // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); @@ -149,19 +196,31 @@ class TableCache { Status GetTableReader(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, bool sequential_mode, - size_t readahead, bool record_read_stats, - HistogramImpl* file_read_hist, + bool record_read_stats, HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const SliceTransform* prefix_extractor = nullptr, bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true, - bool for_compaction = false); + bool prefetch_index_and_filter_in_cache = true); + + // Create a key prefix for looking up the row cache. The prefix is of the + // format row_cache_id + fd_number + seq_no. Later, the user key can be + // appended to form the full key + void CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, IterKey& row_cache_key); + + // Helper function to lookup the row cache for a key. 
It appends the + // user key to row_cache_key at offset prefix_size + bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, + size_t prefix_size, GetContext* get_context); const ImmutableCFOptions& ioptions_; const EnvOptions& env_options_; Cache* const cache_; std::string row_cache_id_; bool immortal_tables_; + BlockCacheTracer* const block_cache_tracer_; }; } // namespace rocksdb diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index ea561e982ff..e479fa008bf 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -9,19 +9,20 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/table_properties_collector.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" #include "options/cf_options.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index f92d563eb8e..42c724c0365 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -4,14 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif #include "db/transaction_log_impl.h" -#include +#include #include "db/write_batch_internal.h" -#include "util/file_reader_writer.h" +#include "file/sequence_file_reader.h" namespace rocksdb { @@ -81,19 +78,16 @@ Status TransactionLogIteratorImpl::status() { return current_status_; } bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; } -bool TransactionLogIteratorImpl::RestrictedRead( - Slice* record, - std::string* scratch) { +bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) { // Don't read if no more complete entries to read from logs if (current_last_seq_ >= versions_->LastSequence()) { return false; } - return current_log_reader_->ReadRecord(record, scratch); + return current_log_reader_->ReadRecord(record, &scratch_); } void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, bool strict) { - std::string scratch; Slice record; started_ = false; is_valid_ = false; @@ -107,7 +101,7 @@ void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, reporter_.Info(current_status_.ToString().c_str()); return; } - while (RestrictedRead(&record, &scratch)) { + while (RestrictedRead(&record)) { if (record.size() < WriteBatchInternal::kHeader) { reporter_.Corruption( record.size(), Status::Corruption("very small log record")); @@ -158,7 +152,6 @@ void TransactionLogIteratorImpl::Next() { } void TransactionLogIteratorImpl::NextImpl(bool internal) { - std::string scratch; Slice record; is_valid_ = false; if (!internal && !started_) { @@ -170,7 +163,7 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { if (current_log_reader_->IsEOF()) { current_log_reader_->UnmarkEOF(); } - while (RestrictedRead(&record, &scratch)) { + while (RestrictedRead(&record)) { if (record.size() < WriteBatchInternal::kHeader) { reporter_.Corruption( 
record.size(), Status::Corruption("very small log record")); @@ -202,7 +195,8 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { if (current_last_seq_ == versions_->LastSequence()) { current_status_ = Status::OK(); } else { - current_status_ = Status::Corruption("NO MORE DATA LEFT"); + const char* msg = "Create a new iterator to fetch the new tail."; + current_status_ = Status::TryAgain(msg); } return; } diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 6382b61a5b7..7d6993d1d0a 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -9,13 +9,13 @@ #include "db/log_reader.h" #include "db/version_set.h" +#include "file/filename.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/transaction_log.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { @@ -86,6 +86,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { size_t current_file_index_; std::unique_ptr current_batch_; std::unique_ptr current_log_reader_; + std::string scratch_; Status OpenLogFile(const LogFile* log_file, std::unique_ptr* file); @@ -107,7 +108,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { VersionSet const* const versions_; const bool seq_per_batch_; // Reads from transaction log only if the writebatch record has been written - bool RestrictedRead(Slice* record, std::string* scratch); + bool RestrictedRead(Slice* record); // Seeks to startingSequenceNumber reading from startFileIndex in files_. // If strict is set,then must get a batch starting with startingSequenceNumber void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false); diff --git a/db/trim_history_scheduler.cc b/db/trim_history_scheduler.cc new file mode 100644 index 00000000000..a213ac65f2c --- /dev/null +++ b/db/trim_history_scheduler.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/trim_history_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace rocksdb { + +void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) { + std::lock_guard lock(checking_mutex_); + cfd->Ref(); + cfds_.push_back(cfd); + is_empty_.store(false, std::memory_order_relaxed); +} + +ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() { + std::lock_guard lock(checking_mutex_); + while (true) { + if (cfds_.empty()) { + return nullptr; + } + ColumnFamilyData* cfd = cfds_.back(); + cfds_.pop_back(); + if (cfds_.empty()) { + is_empty_.store(true, std::memory_order_relaxed); + } + + if (!cfd->IsDropped()) { + // success + return cfd; + } + if (cfd->Unref()) { + // no longer relevant, retry + delete cfd; + } + } +} + +bool TrimHistoryScheduler::Empty() { + bool is_empty = is_empty_.load(std::memory_order_relaxed); + return is_empty; +} + +void TrimHistoryScheduler::Clear() { + ColumnFamilyData* cfd; + while ((cfd = TakeNextColumnFamily()) != nullptr) { + if (cfd->Unref()) { + delete cfd; + } + } + assert(Empty()); +} + +} // namespace rocksdb diff --git a/db/trim_history_scheduler.h b/db/trim_history_scheduler.h new file mode 100644 index 00000000000..e9013b96470 --- /dev/null +++ b/db/trim_history_scheduler.h @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include "util/autovector.h" + +namespace rocksdb { + +class ColumnFamilyData; + +// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps +// track of column families whose flushed immutable memtables may need to be +// removed (aka trimmed). The actual trimming may be slightly delayed. Due to +// the use of the mutex and atomic variable, ScheduleWork, +// TakeNextColumnFamily, and, Empty can be called concurrently. +class TrimHistoryScheduler { + public: + TrimHistoryScheduler() : is_empty_(true) {} + + // When a column family needs history trimming, add cfd to the FIFO queue + void ScheduleWork(ColumnFamilyData* cfd); + + // Remove the column family from the queue, the caller is responsible for + // calling `MemtableList::TrimHistory` + ColumnFamilyData* TakeNextColumnFamily(); + + bool Empty(); + + void Clear(); + + // Not on critical path, use mutex to ensure thread safety + private: + std::atomic is_empty_; + autovector cfds_; + std::mutex checking_mutex_; +}; + +} // namespace rocksdb diff --git a/db/version_builder.cc b/db/version_builder.cc index 84e4dc6579a..53e25a446a8 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -9,13 +9,9 @@ #include "db/version_builder.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include +#include #include #include #include @@ -31,6 +27,7 @@ #include "db/version_set.h" #include "port/port.h" #include "table/table_reader.h" +#include "util/string_util.h" namespace rocksdb { @@ -142,12 +139,12 @@ class VersionBuilder::Rep { } } - void CheckConsistency(VersionStorageInfo* vstorage) { + Status CheckConsistency(VersionStorageInfo* vstorage) { #ifdef NDEBUG if (!vstorage->force_consistency_checks()) { // Dont run consistency checks in release mode except if // explicitly asked to - return; + return Status::OK(); } #endif // make sure the files are sorted correctly @@ -156,10 +153,14 @@ class VersionBuilder::Rep { for (size_t i = 1; i < level_files.size(); i++) { auto f1 = level_files[i - 1]; auto f2 = level_files[i]; +#ifndef NDEBUG + auto pair = std::make_pair(&f1, &f2); + TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency", &pair); +#endif if (level == 0) { if (!level_zero_cmp_(f1, f2)) { fprintf(stderr, "L0 files are not sorted properly"); - abort(); + return Status::Corruption("L0 files are not sorted properly"); } if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { @@ -172,7 +173,13 @@ class VersionBuilder::Rep { " vs. file with global_seqno %" PRIu64 "\n", f1->fd.smallest_seqno, f1->fd.largest_seqno, external_file_seqno); - abort(); + return Status::Corruption( + "L0 file with seqno " + + NumberToString(f1->fd.smallest_seqno) + " " + + NumberToString(f1->fd.largest_seqno) + + " vs. 
file with global_seqno" + + NumberToString(external_file_seqno) + " with fileNumber " + + NumberToString(f1->fd.GetNumber())); } } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { fprintf(stderr, @@ -180,12 +187,19 @@ class VersionBuilder::Rep { " %" PRIu64 "\n", f1->fd.smallest_seqno, f1->fd.largest_seqno, f2->fd.smallest_seqno, f2->fd.largest_seqno); - abort(); + return Status::Corruption( + "L0 files seqno " + NumberToString(f1->fd.smallest_seqno) + + " " + NumberToString(f1->fd.largest_seqno) + " " + + NumberToString(f1->fd.GetNumber()) + " vs. " + + NumberToString(f2->fd.smallest_seqno) + " " + + NumberToString(f2->fd.largest_seqno) + " " + + NumberToString(f2->fd.GetNumber())); } } else { if (!level_nonzero_cmp_(f1, f2)) { fprintf(stderr, "L%d files are not sorted properly", level); - abort(); + return Status::Corruption("L" + NumberToString(level) + + " files are not sorted properly"); } // Make sure there is no overlap in levels > 0 @@ -194,20 +208,24 @@ class VersionBuilder::Rep { fprintf(stderr, "L%d have overlapping ranges %s vs. %s\n", level, (f1->largest).DebugString(true).c_str(), (f2->smallest).DebugString(true).c_str()); - abort(); + return Status::Corruption( + "L" + NumberToString(level) + " have overlapping ranges " + + (f1->largest).DebugString(true) + " vs. " + + (f2->smallest).DebugString(true)); } } } } + return Status::OK(); } - void CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, - int level) { + Status CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, + int level) { #ifdef NDEBUG if (!base_vstorage_->force_consistency_checks()) { // Dont run consistency checks in release mode except if // explicitly asked to - return; + return Status::OK(); } #endif // a file to be deleted better exist in the previous version @@ -245,8 +263,9 @@ class VersionBuilder::Rep { } if (!found) { fprintf(stderr, "not found %" PRIu64 "\n", number); - abort(); + return Status::Corruption("not found " + NumberToString(number)); } + return Status::OK(); } bool CheckConsistencyForNumLevels() { @@ -263,8 +282,11 @@ class VersionBuilder::Rep { } // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - CheckConsistency(base_vstorage_); + Status Apply(VersionEdit* edit) { + Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } // Delete files const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); @@ -281,10 +303,7 @@ class VersionBuilder::Rep { levels_[level].added_files.erase(exising); } } else { - auto exising = invalid_levels_[level].find(number); - if (exising != invalid_levels_[level].end()) { - invalid_levels_[level].erase(exising); - } else { + if (invalid_levels_[level].erase(number) == 0) { // Deleting an non-existing file on invalid level. has_invalid_levels_ = true; } @@ -304,20 +323,29 @@ class VersionBuilder::Rep { levels_[level].added_files[f->fd.GetNumber()] = f; } else { uint64_t number = new_file.second.fd.GetNumber(); - if (invalid_levels_[level].count(number) == 0) { - invalid_levels_[level].insert(number); + auto& lvls = invalid_levels_[level]; + if (lvls.count(number) == 0) { + lvls.insert(number); } else { // Creating an already existing file on invalid level. has_invalid_levels_ = true; } } } + return s; } // Save the current state in *v. 
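With Apply() and SaveTo() now returning Status (see the SaveTo() hunk below), callers can propagate consistency failures instead of aborting; a minimal sketch of such a caller (an assumed helper, not part of this patch):

#include "db/version_builder.h"
#include "db/version_edit.h"
#include "db/version_set.h"

namespace rocksdb {

// Hypothetical helper: applies one edit and saves the result, surfacing
// errors such as Status::Corruption("L0 files are not sorted properly").
Status ApplyEditAndSave(VersionBuilder* builder, VersionEdit* edit,
                        VersionStorageInfo* vstorage) {
  Status s = builder->Apply(edit);
  if (s.ok()) {
    s = builder->SaveTo(vstorage);
  }
  return s;
}

}  // namespace rocksdb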
- void SaveTo(VersionStorageInfo* vstorage) { - CheckConsistency(base_vstorage_); - CheckConsistency(vstorage); + Status SaveTo(VersionStorageInfo* vstorage) { + Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } + + s = CheckConsistency(vstorage); + if (!s.ok()) { + return s; + } for (int level = 0; level < num_levels_; level++) { const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; @@ -361,7 +389,8 @@ class VersionBuilder::Rep { } } - CheckConsistency(vstorage); + s = CheckConsistency(vstorage); + return s; } Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, @@ -479,23 +508,23 @@ VersionBuilder::VersionBuilder(const EnvOptions& env_options, VersionBuilder::~VersionBuilder() { delete rep_; } -void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { - rep_->CheckConsistency(vstorage); +Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { + return rep_->CheckConsistency(vstorage); } -void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, - uint64_t number, int level) { - rep_->CheckConsistencyForDeletes(edit, number, level); +Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, + uint64_t number, int level) { + return rep_->CheckConsistencyForDeletes(edit, number, level); } bool VersionBuilder::CheckConsistencyForNumLevels() { return rep_->CheckConsistencyForNumLevels(); } -void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } +Status VersionBuilder::Apply(VersionEdit* edit) { return rep_->Apply(edit); } -void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { - rep_->SaveTo(vstorage); +Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { + return rep_->SaveTo(vstorage); } Status VersionBuilder::LoadTableHandlers( diff --git a/db/version_builder.h b/db/version_builder.h index 168301fdd61..f5fd121897b 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -27,12 +27,12 @@ class VersionBuilder { VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, VersionStorageInfo* base_vstorage, Logger* info_log = nullptr); ~VersionBuilder(); - void CheckConsistency(VersionStorageInfo* vstorage); - void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, - int level); + Status CheckConsistency(VersionStorageInfo* vstorage); + Status CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + int level); bool CheckConsistencyForNumLevels(); - void Apply(VersionEdit* edit); - void SaveTo(VersionStorageInfo* vstorage); + Status Apply(VersionEdit* edit); + Status SaveTo(VersionStorageInfo* vstorage); Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 514952bb5b1..64d2d2481eb 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -6,10 +6,10 @@ #include #include "db/version_edit.h" #include "db/version_set.h" -#include "util/logging.h" +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -59,14 +59,12 @@ class VersionBuilderTest : public testing::Test { bool sampled = false, SequenceNumber smallest_seqno = 0, SequenceNumber largest_seqno = 0) { assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, path_id, file_size); - 
f->smallest = GetInternalKey(smallest, smallest_seq); - f->largest = GetInternalKey(largest, largest_seq); - f->fd.smallest_seqno = smallest_seqno; - f->fd.largest_seqno = largest_seqno; + FileMetaData* f = new FileMetaData( + file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, + /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = file_size; - f->refs = 0; f->num_entries = num_entries; f->num_deletions = num_deletions; vstorage_.AddFile(level, f); @@ -115,7 +113,9 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -149,7 +149,9 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { VersionEdit version_edit; version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -186,7 +188,9 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { VersionEdit version_edit; version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -214,15 +218,25 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), - GetInternalKey("450"), 200, 200, false); + GetInternalKey("450"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), - GetInternalKey("650"), 200, 200, false); + GetInternalKey("650"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), - GetInternalKey("550"), 200, 200, false); + GetInternalKey("550"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), - GetInternalKey("750"), 200, 200, false); + GetInternalKey("750"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); EnvOptions env_options; @@ -248,24 +262,38 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { VersionEdit version_edit; version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), - GetInternalKey("350"), 200, 200, false); + GetInternalKey("350"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), - GetInternalKey("450"), 200, 200, false); + GetInternalKey("450"), 
200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), - GetInternalKey("650"), 200, 200, false); + GetInternalKey("650"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), - GetInternalKey("550"), 200, 200, false); + GetInternalKey("550"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), - GetInternalKey("750"), 200, 200, false); + GetInternalKey("750"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_builder.Apply(&version_edit); VersionEdit version_edit2; version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), - GetInternalKey("950"), 200, 200, false); + GetInternalKey("950"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), - GetInternalKey("850"), 200, 200, false); + GetInternalKey("850"), 200, 200, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); version_builder.Apply(&version_edit2); version_builder.SaveTo(&new_vstorage); diff --git a/db/version_edit.cc b/db/version_edit.cc index 01ec44515a7..dc1d821d975 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -9,17 +9,23 @@ #include "db/version_edit.h" +#include "db/blob_index.h" #include "db/version_set.h" +#include "logging/event_logger.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/event_logger.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { +// Mask for an identified tag from the future which can be safely ignored. +const uint32_t kTagSafeIgnoreMask = 1 << 13; + // Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. +// disk and should not be changed. The number should be forward compatible so +// users can down-grade RocksDB safely. A future Tag is ignored by doing '&' +// between Tag and kTagSafeIgnoreMask field. enum Tag : uint32_t { kComparator = 1, kLogNumber = 2, @@ -31,6 +37,8 @@ enum Tag : uint32_t { // 8 was used for large value refs kPrevLogNumber = 9, kMinLogNumberToKeep = 10, + // Ignore-able field + kDbId = kTagSafeIgnoreMask + 1, // these are new formats divergent from open source leveldb kNewFile2 = 100, @@ -44,9 +52,6 @@ enum Tag : uint32_t { kInAtomicGroup = 300, }; -// Mask for an identified tag from the future which can be safely ignored. -uint32_t kTagSafeIgnoreMask = 1 << 13; - enum CustomTag : uint32_t { kTerminate = 1, // The end of customized fields kNeedCompaction = 2, @@ -55,6 +60,9 @@ enum CustomTag : uint32_t { // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be // removed when manifest becomes forward-comptabile. 
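The comments above describe the manifest's forward-compatibility scheme: each field is written as a tag followed by a length-prefixed payload, and a reader that does not recognize a tag may skip it only when the tag carries the safe-to-ignore bit (kTagSafeIgnoreMask for top-level tags such as the new kDbId; the per-file custom tags use the inverse kCustomTagNonSafeIgnoreMask convention). Below is a self-contained toy version of that idea, not the RocksDB encoder: it uses single-byte tags and lengths where the real code uses PutVarint32 and PutLengthPrefixedSlice, and the tag values are made up.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>

// Toy safe-to-ignore bit; the manifest uses 1 << 13 for top-level tags.
constexpr uint32_t kSafeIgnoreMask = 1u << 6;

// Toy encoders; the real code uses PutVarint32 / PutLengthPrefixedSlice.
void PutTag(std::string* dst, uint32_t tag) {
  dst->push_back(static_cast<char>(tag));
}
void PutField(std::string* dst, uint32_t tag, const std::string& payload) {
  PutTag(dst, tag);
  dst->push_back(static_cast<char>(payload.size()));  // length prefix
  dst->append(payload);                                // payload bytes
}

// Decode until the terminating tag (0 here). Unknown tags are skipped only
// when they carry the safe-to-ignore bit; otherwise the record is rejected,
// so an old binary never silently drops a field it was required to honor.
bool Decode(const std::string& in) {
  size_t pos = 0;
  while (pos < in.size()) {
    const uint32_t tag = static_cast<unsigned char>(in[pos++]);
    if (tag == 0) {
      return true;  // terminator reached, record understood
    }
    if (pos >= in.size()) {
      return false;
    }
    const size_t len = static_cast<unsigned char>(in[pos++]);
    if (pos + len > in.size()) {
      return false;
    }
    if (tag == 2) {
      // A known field: interpret in[pos, pos + len) here.
    } else if ((tag & kSafeIgnoreMask) == 0) {
      return false;  // unknown *and* not ignorable
    }
    pos += len;  // known or ignorable: consume the payload either way
  }
  return false;  // ran out of input before the terminator
}

int main() {
  std::string rec;
  PutField(&rec, 2, "known");
  PutField(&rec, 3 | kSafeIgnoreMask, "from-the-future");  // skippable field
  PutTag(&rec, 0);
  std::cout << (Decode(rec) ? "decoded" : "rejected") << "\n";  // decoded

  std::string bad;
  PutField(&bad, 3, "mandatory-but-unknown");
  PutTag(&bad, 0);
  std::cout << (Decode(bad) ? "decoded" : "rejected") << "\n";  // rejected
  return 0;
}
```

This is what lets kDbId sit above kTagSafeIgnoreMask: an older binary replaying a newer MANIFEST simply skips the DB id field instead of failing recovery.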
kMinLogNumberToKeepHack = 3, + kOldestBlobFileNumber = 4, + kOldestAncesterTime = 5, + kFileCreationTime = 6, kPathId = 65, }; // If this bit for the custom tag is set, opening DB should fail if @@ -66,7 +74,51 @@ uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { return number | (path_id * (kFileNumberMask + 1)); } +void FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, + ValueType value_type) { + if (smallest.size() == 0) { + smallest.DecodeFrom(key); + } + largest.DecodeFrom(key); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); + +#ifndef ROCKSDB_LITE + if (value_type == kTypeBlobIndex) { + BlobIndex blob_index; + const Status s = blob_index.DecodeFrom(value); + if (!s.ok()) { + return; + } + + if (blob_index.IsInlined()) { + return; + } + + if (blob_index.HasTTL()) { + return; + } + + // Paranoid check: this should not happen because BlobDB numbers the blob + // files starting from 1. + if (blob_index.file_number() == kInvalidBlobFileNumber) { + return; + } + + if (oldest_blob_file_number == kInvalidBlobFileNumber || + oldest_blob_file_number > blob_index.file_number()) { + oldest_blob_file_number = blob_index.file_number(); + } + } +#else + (void)value; + (void)value_type; +#endif +} + void VersionEdit::Clear() { + db_id_.clear(); comparator_.clear(); max_level_ = 0; log_number_ = 0; @@ -75,6 +127,7 @@ void VersionEdit::Clear() { next_file_number_ = 0; max_column_family_ = 0; min_log_number_to_keep_ = 0; + has_db_id_ = false; has_comparator_ = false; has_log_number_ = false; has_prev_log_number_ = false; @@ -93,6 +146,10 @@ void VersionEdit::Clear() { } bool VersionEdit::EncodeTo(std::string* dst) const { + if (has_db_id_) { + PutVarint32(dst, kDbId); + PutLengthPrefixedSlice(dst, db_id_); + } if (has_comparator_) { PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); @@ -123,75 +180,79 @@ bool VersionEdit::EncodeTo(std::string* dst) const { if (!f.smallest.Valid() || !f.largest.Valid()) { return false; } - bool has_customized_fields = false; - if (f.marked_for_compaction || has_min_log_number_to_keep_) { - PutVarint32(dst, kNewFile4); - has_customized_fields = true; - } else if (f.fd.GetPathId() == 0) { - // Use older format to make sure user can roll back the build if they - // don't config multiple DB paths. - PutVarint32(dst, kNewFile2); - } else { - PutVarint32(dst, kNewFile3); - } + PutVarint32(dst, kNewFile4); PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber()); - if (f.fd.GetPathId() != 0 && !has_customized_fields) { - // kNewFile3 - PutVarint32(dst, f.fd.GetPathId()); - } PutVarint64(dst, f.fd.GetFileSize()); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); - if (has_customized_fields) { - // Customized fields' format: - // +-----------------------------+ - // | 1st field's tag (varint32) | - // +-----------------------------+ - // | 1st field's size (varint32) | - // +-----------------------------+ - // | bytes for 1st field | - // | (based on size decoded) | - // +-----------------------------+ - // | | - // | ...... 
| - // | | - // +-----------------------------+ - // | last field's size (varint32)| - // +-----------------------------+ - // | bytes for last field | - // | (based on size decoded) | - // +-----------------------------+ - // | terminating tag (varint32) | - // +-----------------------------+ - // - // Customized encoding for fields: - // tag kPathId: 1 byte as path_id - // tag kNeedCompaction: - // now only can take one char value 1 indicating need-compaction - // - if (f.fd.GetPathId() != 0) { - PutVarint32(dst, CustomTag::kPathId); - char p = static_cast(f.fd.GetPathId()); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - if (f.marked_for_compaction) { - PutVarint32(dst, CustomTag::kNeedCompaction); - char p = static_cast(1); - PutLengthPrefixedSlice(dst, Slice(&p, 1)); - } - if (has_min_log_number_to_keep_ && !min_log_num_written) { - PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack); - std::string varint_log_number; - PutFixed64(&varint_log_number, min_log_number_to_keep_); - PutLengthPrefixedSlice(dst, Slice(varint_log_number)); - min_log_num_written = true; - } - TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", - dst); - - PutVarint32(dst, CustomTag::kTerminate); + // Customized fields' format: + // +-----------------------------+ + // | 1st field's tag (varint32) | + // +-----------------------------+ + // | 1st field's size (varint32) | + // +-----------------------------+ + // | bytes for 1st field | + // | (based on size decoded) | + // +-----------------------------+ + // | | + // | ...... | + // | | + // +-----------------------------+ + // | last field's size (varint32)| + // +-----------------------------+ + // | bytes for last field | + // | (based on size decoded) | + // +-----------------------------+ + // | terminating tag (varint32) | + // +-----------------------------+ + // + // Customized encoding for fields: + // tag kPathId: 1 byte as path_id + // tag kNeedCompaction: + // now only can take one char value 1 indicating need-compaction + // + PutVarint32(dst, CustomTag::kOldestAncesterTime); + std::string varint_oldest_ancester_time; + PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime", + &varint_oldest_ancester_time); + PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time)); + + PutVarint32(dst, CustomTag::kFileCreationTime); + std::string varint_file_creation_time; + PutVarint64(&varint_file_creation_time, f.file_creation_time); + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime", + &varint_file_creation_time); + PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); + + if (f.fd.GetPathId() != 0) { + PutVarint32(dst, CustomTag::kPathId); + char p = static_cast(f.fd.GetPathId()); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } + if (f.marked_for_compaction) { + PutVarint32(dst, CustomTag::kNeedCompaction); + char p = static_cast(1); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + if (has_min_log_number_to_keep_ && !min_log_num_written) { + PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack); + std::string varint_log_number; + PutFixed64(&varint_log_number, min_log_number_to_keep_); + PutLengthPrefixedSlice(dst, Slice(varint_log_number)); + min_log_num_written = true; + } + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + PutVarint32(dst, CustomTag::kOldestBlobFileNumber); + std::string oldest_blob_file_number; + PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number); + 
PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number)); + } + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", + dst); + + PutVarint32(dst, CustomTag::kTerminate); } // 0 is default and does not need to be explicitly written @@ -278,6 +339,16 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "path_id wrong vaue"; } break; + case kOldestAncesterTime: + if (!GetVarint64(&field, &f.oldest_ancester_time)) { + return "invalid oldest ancester time"; + } + break; + case kFileCreationTime: + if (!GetVarint64(&field, &f.file_creation_time)) { + return "invalid file creation time"; + } + break; case kNeedCompaction: if (field.size() != 1) { return "need_compaction field wrong size"; @@ -292,6 +363,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { } has_min_log_number_to_keep_ = true; break; + case kOldestBlobFileNumber: + if (!GetVarint64(&field, &f.oldest_blob_file_number)) { + return "invalid oldest blob file number"; + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -320,9 +396,16 @@ Status VersionEdit::DecodeFrom(const Slice& src) { FileMetaData f; Slice str; InternalKey key; - while (msg == nullptr && GetVarint32(&input, &tag)) { switch (tag) { + case kDbId: + if (GetLengthPrefixedSlice(&input, &str)) { + db_id_ = str.ToString(); + has_db_id_ = true; + } else { + msg = "db id"; + } + break; case kComparator: if (GetLengthPrefixedSlice(&input, &str)) { comparator_ = str.ToString(); @@ -537,6 +620,10 @@ Status VersionEdit::DecodeFrom(const Slice& src) { std::string VersionEdit::DebugString(bool hex_key) const { std::string r; r.append("VersionEdit {"); + if (has_db_id_) { + r.append("\n DB ID: "); + r.append(db_id_); + } if (has_comparator_) { r.append("\n Comparator: "); r.append(comparator_); @@ -581,6 +668,14 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(f.smallest.DebugString(hex_key)); r.append(" .. 
"); r.append(f.largest.DebugString(hex_key)); + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, f.oldest_blob_file_number); + } + r.append(" oldest_ancester_time:"); + AppendNumberTo(&r, f.oldest_ancester_time); + r.append(" file_creation_time:"); + AppendNumberTo(&r, f.file_creation_time); } r.append("\n ColumnFamily: "); AppendNumberTo(&r, column_family_); @@ -608,6 +703,9 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { JSONWriter jw; jw << "EditNumber" << edit_num; + if (has_db_id_) { + jw << "DB ID" << db_id_; + } if (has_comparator_) { jw << "Comparator" << comparator_; } @@ -652,6 +750,9 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "FileSize" << f.fd.GetFileSize(); jw << "SmallestIKey" << f.smallest.DebugString(hex_key); jw << "LargestIKey" << f.largest.DebugString(hex_key); + if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { + jw << "OldestBlobFile" << f.oldest_blob_file_number; + } jw.EndArrayedObject(); } diff --git a/db/version_edit.h b/db/version_edit.h index ee6499cdc3b..5815d18dca6 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -10,19 +10,23 @@ #pragma once #include #include +#include #include #include -#include -#include "rocksdb/cache.h" #include "db/dbformat.h" -#include "util/arena.h" +#include "memory/arena.h" +#include "rocksdb/cache.h" +#include "table/table_reader.h" #include "util/autovector.h" namespace rocksdb { class VersionSet; -const uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; +constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; +constexpr uint64_t kInvalidBlobFileNumber = 0; +constexpr uint64_t kUnknownOldestAncesterTime = 0; +constexpr uint64_t kUnknownFileCreationTime = 0; extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); @@ -52,6 +56,8 @@ struct FileDescriptor { smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno) {} + FileDescriptor(const FileDescriptor& fd) { *this = fd; } + FileDescriptor& operator=(const FileDescriptor& fd) { table_reader = fd.table_reader; packed_number_and_path_id = fd.packed_number_and_path_id; @@ -89,7 +95,7 @@ struct FileMetaData { InternalKey largest; // Largest internal key served by table // Needs to be disposed when refs becomes 0. - Cache::Handle* table_reader_handle; + Cache::Handle* table_reader_handle = nullptr; FileSampledStats stats; @@ -98,45 +104,58 @@ struct FileMetaData { // File size compensated by deletion entry. // This is updated in Version::UpdateAccumulatedStats() first time when the // file is created or loaded. After it is updated (!= 0), it is immutable. - uint64_t compensated_file_size; + uint64_t compensated_file_size = 0; // These values can mutate, but they can only be read or written from // single-threaded LogAndApply thread - uint64_t num_entries; // the number of entries. - uint64_t num_deletions; // the number of deletion entries. - uint64_t raw_key_size; // total uncompressed key size. - uint64_t raw_value_size; // total uncompressed value size. - - int refs; // Reference count - - bool being_compacted; // Is this file undergoing compaction? - bool init_stats_from_file; // true if the data-entry stats of this file - // has initialized from file. - - bool marked_for_compaction; // True if client asked us nicely to compact this - // file. 
- - FileMetaData() - : table_reader_handle(nullptr), - compensated_file_size(0), - num_entries(0), - num_deletions(0), - raw_key_size(0), - raw_value_size(0), - refs(0), - being_compacted(false), - init_stats_from_file(false), - marked_for_compaction(false) {} + uint64_t num_entries = 0; // the number of entries. + uint64_t num_deletions = 0; // the number of deletion entries. + uint64_t raw_key_size = 0; // total uncompressed key size. + uint64_t raw_value_size = 0; // total uncompressed value size. + + int refs = 0; // Reference count + + bool being_compacted = false; // Is this file undergoing compaction? + bool init_stats_from_file = false; // true if the data-entry stats of this + // file has initialized from file. + + bool marked_for_compaction = false; // True if client asked us nicely to + // compact this file. + + // Used only in BlobDB. The file number of the oldest blob file this SST file + // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; + + // The file could be the compaction output from other SST files, which could + // in turn be outputs for compact older SST files. We track the memtable + // flush timestamp for the oldest SST file that eventaully contribute data + // to this file. 0 means the information is not available. + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + + // Unix time when the SST file is created. + uint64_t file_creation_time = kUnknownFileCreationTime; + + FileMetaData() = default; + + FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, + const InternalKey& smallest_key, const InternalKey& largest_key, + const SequenceNumber& smallest_seq, + const SequenceNumber& largest_seq, bool marked_for_compact, + uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, + uint64_t _file_creation_time) + : fd(file, file_path_id, file_size, smallest_seq, largest_seq), + smallest(smallest_key), + largest(largest_key), + marked_for_compaction(marked_for_compact), + oldest_blob_file_number(oldest_blob_file), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) { + TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); + } // REQUIRED: Keys must be given to the function in sorted order (it expects // the last key to be the largest). - void UpdateBoundaries(const Slice& key, SequenceNumber seqno) { - if (smallest.size() == 0) { - smallest.DecodeFrom(key); - } - largest.DecodeFrom(key); - fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); - fd.largest_seqno = std::max(fd.largest_seqno, seqno); - } + void UpdateBoundaries(const Slice& key, const Slice& value, + SequenceNumber seqno, ValueType value_type); // Unlike UpdateBoundaries, ranges do not need to be presented in any // particular order. @@ -152,6 +171,29 @@ struct FileMetaData { fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); fd.largest_seqno = std::max(fd.largest_seqno, seqno); } + + // Try to get oldest ancester time from the class itself or table properties + // if table reader is already pinned. + // 0 means the information is not available. 
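The comment just above summarizes the lookup order the next two helpers use: prefer the value recorded in the manifest, fall back to the pinned table reader's properties, and otherwise report 0 for "unknown". A simplified stand-alone sketch of that fallback chain follows; TableProps and FileInfo are invented stand-ins, not the RocksDB types.

```cpp
#include <cstdint>
#include <iostream>

constexpr uint64_t kUnknownOldestAncesterTime = 0;

// Invented stand-in for pinned table properties.
struct TableProps {
  uint64_t creation_time = 0;
};

struct FileInfo {
  uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
  const TableProps* pinned_props = nullptr;  // set only when a reader is pinned

  // Same fallback order as FileMetaData::TryGetOldestAncesterTime():
  // 1. the value recorded in the manifest, 2. the pinned table property,
  // 3. the "unknown" sentinel.
  uint64_t TryGetOldestAncesterTime() const {
    if (oldest_ancester_time != kUnknownOldestAncesterTime) {
      return oldest_ancester_time;
    }
    if (pinned_props != nullptr) {
      return pinned_props->creation_time;
    }
    return kUnknownOldestAncesterTime;
  }
};

int main() {
  TableProps props{1600000000};
  FileInfo from_manifest{1500000000, nullptr};
  FileInfo from_props{kUnknownOldestAncesterTime, &props};
  FileInfo unknown{};
  std::cout << from_manifest.TryGetOldestAncesterTime() << "\n";  // 1500000000
  std::cout << from_props.TryGetOldestAncesterTime() << "\n";     // 1600000000
  std::cout << unknown.TryGetOldestAncesterTime() << "\n";        // 0
  return 0;
}
```

TryGetFileCreationTime follows the same pattern with the file_creation_time property, which is what lets TTL and periodic compaction still work when the value was never recorded in the manifest.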
+ uint64_t TryGetOldestAncesterTime() { + if (oldest_ancester_time != kUnknownOldestAncesterTime) { + return oldest_ancester_time; + } else if (fd.table_reader != nullptr && + fd.table_reader->GetTableProperties() != nullptr) { + return fd.table_reader->GetTableProperties()->creation_time; + } + return kUnknownOldestAncesterTime; + } + + uint64_t TryGetFileCreationTime() { + if (file_creation_time != kUnknownFileCreationTime) { + return file_creation_time; + } else if (fd.table_reader != nullptr && + fd.table_reader->GetTableProperties() != nullptr) { + return fd.table_reader->GetTableProperties()->file_creation_time; + } + return kUnknownFileCreationTime; + } }; // A compressed copy of file meta data that just contain minimum data needed @@ -189,6 +231,10 @@ struct LevelFilesBrief { } }; +// The state of a DB at any given time is referred to as a Version. +// Any modification to the Version is considered a Version Edit. A Version is +// constructed by joining a sequence of Version Edits. Version Edits are written +// to the MANIFEST file. class VersionEdit { public: VersionEdit() { Clear(); } @@ -196,6 +242,11 @@ class VersionEdit { void Clear(); + void SetDBId(const std::string& db_id) { + has_db_id_ = true; + db_id_ = db_id; + } + void SetComparatorName(const Slice& name) { has_comparator_ = true; comparator_ = name.ToString(); @@ -225,6 +276,8 @@ class VersionEdit { min_log_number_to_keep_ = num; } + bool has_db_id() { return has_db_id_; } + bool has_log_number() { return has_log_number_; } uint64_t log_number() { return log_number_; } @@ -236,21 +289,20 @@ class VersionEdit { // Add the specified file at the specified number. // REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file + // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file + // referred to by this file if any, kInvalidBlobFileNumber otherwise. 
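The block comment added above VersionEdit states the model this file implements: a Version is the database's file layout at a point in time, and it is rebuilt by joining a sequence of VersionEdits read back from the MANIFEST. The toy replay below illustrates only that joining idea; the types are deliberately simplified and are not the real API.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <utility>
#include <vector>

// Toy model: a Version is just the set of live file numbers per level.
using Version = std::map<int, std::set<uint64_t>>;

// Toy edit: which files a flush/compaction added and which it removed.
struct Edit {
  std::vector<std::pair<int, uint64_t>> added;    // (level, file number)
  std::vector<std::pair<int, uint64_t>> deleted;  // (level, file number)
};

// Joining a sequence of edits onto a base version yields the current version;
// manifest recovery does the same with the VersionEdits it reads back.
Version Replay(Version base, const std::vector<Edit>& edits) {
  for (const Edit& e : edits) {
    for (const auto& [level, file] : e.deleted) {
      base[level].erase(file);
    }
    for (const auto& [level, file] : e.added) {
      base[level].insert(file);
    }
  }
  return base;
}

int main() {
  Version base{{0, {1, 2}}, {1, {3}}};
  std::vector<Edit> edits{
      {{{1, 4}}, {{0, 1}}},   // add file 4 to L1, drop file 1 from L0
      {{{2, 5}}, {{1, 3}}}};  // add file 5 to L2, drop file 3 from L1
  Version v = Replay(base, edits);
  for (const auto& [level, files] : v) {
    std::cout << "L" << level << ":";
    for (uint64_t f : files) {
      std::cout << " " << f;
    }
    std::cout << "\n";
  }
  return 0;
}
```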
void AddFile(int level, uint64_t file, uint32_t file_path_id, uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, - const SequenceNumber& largest_seqno, - bool marked_for_compaction) { + const SequenceNumber& largest_seqno, bool marked_for_compaction, + uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, + uint64_t file_creation_time) { assert(smallest_seqno <= largest_seqno); - FileMetaData f; - f.fd = FileDescriptor(file, file_path_id, file_size, smallest_seqno, - largest_seqno); - f.smallest = smallest; - f.largest = largest; - f.fd.smallest_seqno = smallest_seqno; - f.fd.largest_seqno = largest_seqno; - f.marked_for_compaction = marked_for_compaction; - new_files_.emplace_back(level, std::move(f)); + new_files_.emplace_back( + level, FileMetaData(file, file_path_id, file_size, smallest, largest, + smallest_seqno, largest_seqno, + marked_for_compaction, oldest_blob_file_number, + oldest_ancester_time, file_creation_time)); } void AddFile(int level, const FileMetaData& f) { @@ -312,14 +364,18 @@ class VersionEdit { std::string DebugString(bool hex_key = false) const; std::string DebugJSON(int edit_num, bool hex_key = false) const; + const std::string GetDbId() { return db_id_; } + private: friend class ReactiveVersionSet; friend class VersionSet; friend class Version; + friend class AtomicGroupReadBuffer; bool GetLevel(Slice* input, int* level, const char** msg); int max_level_; + std::string db_id_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; @@ -328,6 +384,7 @@ class VersionEdit { // The most recent WAL log number that is deleted uint64_t min_log_number_to_keep_; SequenceNumber last_sequence_; + bool has_db_id_; bool has_comparator_; bool has_log_number_; bool has_prev_log_number_; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 64d1fd77bc1..8a4c1380c1e 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_edit.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/coding.h" -#include "util/sync_point.h" -#include "util/testharness.h" namespace rocksdb { @@ -36,7 +36,8 @@ TEST_F(VersionEditTest, EncodeDecode) { edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), - kBig + 500 + i, kBig + 600 + i, false); + kBig + 500 + i, kBig + 600 + i, false, kInvalidBlobFileNumber, + 888, 678); edit.DeleteFile(4, kBig + 700 + i); } @@ -53,13 +54,20 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true); + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false); + kBig + 601, false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, - kBig + 602, true); + kBig + 602, true, kInvalidBlobFileNumber, 666, 888); + edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), + InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, + kBig + 603, true, 1001, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); + ; edit.DeleteFile(4, 700); @@ -78,9 +86,18 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { ASSERT_TRUE(new_files[0].second.marked_for_compaction); ASSERT_TRUE(!new_files[1].second.marked_for_compaction); ASSERT_TRUE(new_files[2].second.marked_for_compaction); - ASSERT_EQ(3, new_files[0].second.fd.GetPathId()); - ASSERT_EQ(3, new_files[1].second.fd.GetPathId()); - ASSERT_EQ(0, new_files[2].second.fd.GetPathId()); + ASSERT_TRUE(new_files[3].second.marked_for_compaction); + ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[2].second.fd.GetPathId()); + ASSERT_EQ(0u, new_files[3].second.fd.GetPathId()); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[0].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[1].second.oldest_blob_file_number); + ASSERT_EQ(kInvalidBlobFileNumber, + new_files[2].second.oldest_blob_file_number); + ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number); } TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { @@ -88,10 +105,11 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true); + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, - kBig + 601, false); + kBig + 601, false, kInvalidBlobFileNumber, 686, 868); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -127,8 +145,8 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { auto& new_files = parsed.GetNewFiles(); ASSERT_TRUE(new_files[0].second.marked_for_compaction); ASSERT_TRUE(!new_files[1].second.marked_for_compaction); - ASSERT_EQ(3, 
new_files[0].second.fd.GetPathId()); - ASSERT_EQ(3, new_files[1].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); + ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); ASSERT_EQ(1u, parsed.GetDeletedFiles().size()); } @@ -137,7 +155,8 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { VersionEdit edit; edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue), InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, - kBig + 600, true); + kBig + 600, true, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -164,7 +183,9 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { TEST_F(VersionEditTest, EncodeEmptyFile) { VersionEdit edit; - edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false); + edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } @@ -239,6 +260,16 @@ TEST_F(VersionEditTest, IgnorableField) { ASSERT_EQ(88, ve.next_file_number()); } +TEST_F(VersionEditTest, DbId) { + VersionEdit edit; + edit.SetDBId("ab34-cd12-435f-er00"); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetDBId("34ba-cd12-435f-er01"); + TestEncodeDecode(edit); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/version_set.cc b/db/version_set.cc index fdc07fee0e5..444996e409d 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,21 +9,17 @@ #include "db/version_set.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include #include #include +#include #include #include #include #include #include #include -#include "db/compaction.h" +#include "compaction/compaction.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -33,8 +29,13 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" +#include "monitoring/persistent_stats_history.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -44,15 +45,13 @@ #include "table/merging_iterator.h" #include "table/meta_blocks.h" #include "table/multiget_context.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/filename.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { @@ -93,7 +92,8 @@ Status OverlapWithIterator(const Comparator* ucmp, return Status::Corruption("DB have corrupted keys"); } - if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) { + if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= + 0) { *overlap = true; } } @@ -171,17 +171,16 @@ class FilePicker { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
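The comment above spells out the invariant FilePicker relies on: once the binary-search bounds collapse to a single file, the key is guaranteed to fall in that file's range, and the two comparisons against the file's smallest and largest keys now go through CompareWithoutTimestamp so a user-defined timestamp suffix cannot affect file selection. A rough illustration of that classification step, with a fixed-width suffix standing in for the timestamp:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

// Stand-in for Comparator::CompareWithoutTimestamp(): user keys carry a
// fixed-width timestamp suffix that must not influence file selection.
int CompareWithoutTimestamp(const std::string& a, const std::string& b,
                            size_t ts_width) {
  const std::string a_key = a.substr(0, a.size() - std::min(ts_width, a.size()));
  const std::string b_key = b.substr(0, b.size() - std::min(ts_width, b.size()));
  return a_key.compare(b_key);
}

// Classify a lookup key against one file's [smallest, largest] user-key range:
// -1 before the file, 0 inside it, +1 after it. FilePicker derives its
// cmp_smallest / cmp_largest decisions (and the next level's search bounds)
// from the same two comparisons.
int ClassifyKey(const std::string& user_key, const std::string& smallest,
                const std::string& largest, size_t ts_width) {
  if (CompareWithoutTimestamp(user_key, smallest, ts_width) < 0) return -1;
  if (CompareWithoutTimestamp(user_key, largest, ts_width) > 0) return +1;
  return 0;
}

int main() {
  // Two-character suffixes ("t1", "t9", ...) play the role of timestamps.
  std::cout << ClassifyKey("bananat9", "applet1", "cherryt1", 2) << "\n";  // 0: in range
  std::cout << ClassifyKey("zebrat1", "applet1", "cherryt1", 2) << "\n";   // 1: after the file
  std::cout << ClassifyKey("aaat1", "applet1", "cherryt1", 2) << "\n";     // -1: before the file
  return 0;
}
```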
- assert( - curr_level_ == 0 || - curr_index_in_curr_level_ == start_index_in_curr_level_ || - user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)) <= 0); - - int cmp_smallest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->smallest_key)); + assert(curr_level_ == 0 || + curr_index_in_curr_level_ == start_index_in_curr_level_ || + user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)) <= 0); + + int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare(user_key_, - ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key_, ExtractUserKey(f->largest_key)); } // Setup file search bound for the next level based on the @@ -353,7 +352,7 @@ class FilePickerMultiGet { struct FilePickerContext; public: - FilePickerMultiGet(std::vector* files, MultiGetRange* range, + FilePickerMultiGet(MultiGetRange* range, autovector* file_levels, unsigned int num_levels, FileIndexer* file_indexer, const Comparator* user_comparator, @@ -368,18 +367,12 @@ class FilePickerMultiGet { maybe_repeat_key_(false), current_level_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()), -#ifndef NDEBUG - files_(files), -#endif level_files_brief_(file_levels), is_hit_file_last_in_level_(false), curr_file_level_(nullptr), file_indexer_(file_indexer), user_comparator_(user_comparator), internal_comparator_(internal_comparator) { -#ifdef NDEBUG - (void)files; -#endif for (auto iter = range_->begin(); iter != range_->end(); ++iter) { fp_ctx_array_[iter.index()] = FilePickerContext(0, FileIndexer::kLevelMaxIndex); @@ -416,6 +409,18 @@ class FilePickerMultiGet { bool file_hit = false; int cmp_largest = -1; if (curr_file_index >= curr_file_level_->num_files) { + // In the unlikely case the next key is a duplicate of the current key, + // and the current key is the last in the level and the internal key + // was not found, we need to skip lookup for the remaining keys and + // reset the search bounds + if (batch_iter_ != current_level_range_.end()) { + ++batch_iter_; + for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) { + struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; + fp_ctx.search_left_bound = 0; + fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex; + } + } return false; } // Loops over keys in the MultiGet batch until it finds a file with @@ -473,25 +478,6 @@ class FilePickerMultiGet { } else { file_hit = true; } -#ifndef NDEBUG - // Sanity check to make sure that the files are correctly sorted - if (f != prev_file_) { - if (prev_file_) { - if (curr_level_ != 0) { - int comp_sign = internal_comparator_->Compare( - prev_file_->largest_key, f->smallest_key); - assert(comp_sign < 0); - } else if (fp_ctx.curr_index_in_curr_level > 0) { - // level == 0, the current file cannot be newer than the previous - // one. Use compressed data structure, has no attribute seqNo - assert(!NewestFirstBySeqNo( - files_[0][fp_ctx.curr_index_in_curr_level], - files_[0][fp_ctx.curr_index_in_curr_level - 1])); - } - } - prev_file_ = f; - } -#endif if (cmp_largest == 0) { // cmp_largest is 0, which means the next key will not be in this // file, so stop looking further. Also don't increment megt_iter_ @@ -533,7 +519,10 @@ class FilePickerMultiGet { // any further for that key, so advance batch_iter_. 
Else, keep // batch_iter_ positioned on that key so we look it up again in // the next file - if (current_level_range_.CheckKeyDone(batch_iter_)) { + // For L0, always advance the key because we will look in the next + // file regardless for all keys not found yet + if (current_level_range_.CheckKeyDone(batch_iter_) || + curr_level_ == 0) { ++batch_iter_; } } @@ -601,7 +590,8 @@ class FilePickerMultiGet { unsigned int start_index_in_curr_level; FilePickerContext(int32_t left, int32_t right) - : search_left_bound(left), search_right_bound(right) {} + : search_left_bound(left), search_right_bound(right), + curr_index_in_curr_level(0), start_index_in_curr_level(0) {} FilePickerContext() = default; }; @@ -619,9 +609,6 @@ class FilePickerMultiGet { bool maybe_repeat_key_; MultiGetRange current_level_range_; MultiGetRange current_file_range_; -#ifndef NDEBUG - std::vector* files_; -#endif autovector* level_files_brief_; bool search_ended_; bool is_hit_file_last_in_level_; @@ -629,9 +616,6 @@ class FilePickerMultiGet { FileIndexer* file_indexer_; const Comparator* user_comparator_; const InternalKeyComparator* internal_comparator_; -#ifndef NDEBUG - FdWithKeyRange* prev_file_; -#endif // Setup local variables to search next level. // Returns false if there are no more levels to search. @@ -640,9 +624,6 @@ class FilePickerMultiGet { MultiGetRange::Iterator mget_iter = current_level_range_.begin(); if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < curr_file_level_->num_files) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -738,9 +719,6 @@ class FilePickerMultiGet { fp_ctx.curr_index_in_curr_level = start_index; } if (level_contains_keys) { -#ifndef NDEBUG - prev_file_ = nullptr; -#endif batch_iter_prev_ = current_level_range_.begin(); batch_iter_ = current_level_range_.begin(); return true; @@ -820,14 +798,16 @@ static bool AfterFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs before all keys and is therefore never after *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->largest_key)) > 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->largest_key)) > 0); } static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, const FdWithKeyRange* f) { // nullptr user_key occurs after all keys and is therefore never before *f return (user_key != nullptr && - ucmp->Compare(*user_key, ExtractUserKey(f->smallest_key)) < 0); + ucmp->CompareWithoutTimestamp(*user_key, + ExtractUserKey(f->smallest_key)) < 0); } bool SomeFileOverlapsRange( @@ -872,24 +852,26 @@ namespace { class LevelIterator final : public InternalIterator { public: - LevelIterator( - TableCache* table_cache, const ReadOptions& read_options, - const EnvOptions& env_options, const InternalKeyComparator& icomparator, - const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, - bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, - bool skip_filters, int level, RangeDelAggregator* range_del_agg, - const std::vector* compaction_boundaries = - nullptr) + LevelIterator(TableCache* table_cache, const ReadOptions& read_options, + const EnvOptions& env_options, + const InternalKeyComparator& icomparator, + const LevelFilesBrief* flevel, + const SliceTransform* prefix_extractor, bool should_sample, + HistogramImpl* file_read_hist, TableReaderCaller caller, + bool skip_filters, int level, 
RangeDelAggregator* range_del_agg, + const std::vector* + compaction_boundaries = nullptr) : table_cache_(table_cache), read_options_(read_options), env_options_(env_options), icomparator_(icomparator), user_comparator_(icomparator.user_comparator()), flevel_(flevel), - prefix_extractor_(prefix_extractor), + prefix_extractor_(read_options.total_order_seek ? nullptr + : prefix_extractor), file_read_hist_(file_read_hist), should_sample_(should_sample), - for_compaction_(for_compaction), + caller_(caller), skip_filters_(skip_filters), file_index_(flevel_->num_files), level_(level), @@ -907,7 +889,7 @@ class LevelIterator final : public InternalIterator { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { return file_iter_.Valid(); } @@ -915,30 +897,46 @@ class LevelIterator final : public InternalIterator { assert(Valid()); return file_iter_.key(); } + Slice value() const override { assert(Valid()); return file_iter_.value(); } + Status status() const override { return file_iter_.iter() ? file_iter_.status() : Status::OK(); } + + inline bool MayBeOutOfLowerBound() override { + assert(Valid()); + return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound(); + } + + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return file_iter_.MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; if (file_iter_.iter()) { file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } } + bool IsKeyPinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsKeyPinned(); } + bool IsValuePinned() const override { return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && file_iter_.iter() && file_iter_.IsValuePinned(); } private: - void SkipEmptyFileForward(); + // Return true if at least one invalid file is seen and skipped. + bool SkipEmptyFileForward(); void SkipEmptyFileBackward(); void SetFileIterator(InternalIterator* iter); void InitFileIterator(size_t new_file_index); @@ -957,8 +955,9 @@ class LevelIterator final : public InternalIterator { bool KeyReachedUpperBound(const Slice& internal_key) { return read_options_.iterate_upper_bound != nullptr && - user_comparator_.Compare(ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + user_comparator_.CompareWithoutTimestamp( + ExtractUserKey(internal_key), + *read_options_.iterate_upper_bound) >= 0; } InternalIterator* NewFileIterator() { @@ -974,12 +973,27 @@ class LevelIterator final : public InternalIterator { smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; } + CheckMayBeOutOfLowerBound(); return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *file_meta.file_metadata, range_del_agg_, prefix_extractor_, - nullptr /* don't need reference to table */, - file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, - level_, smallest_compaction_key, largest_compaction_key); + nullptr /* don't need reference to table */, file_read_hist_, caller_, + /*arena=*/nullptr, skip_filters_, level_, smallest_compaction_key, + largest_compaction_key); + } + + // Check if current file being fully within iterate_lower_bound. 
+ // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update may_be_out_of_lower_bound_ accordingly. + void CheckMayBeOutOfLowerBound() { + if (read_options_.iterate_lower_bound != nullptr && + file_index_ < flevel_->num_files) { + may_be_out_of_lower_bound_ = + user_comparator_.Compare( + ExtractUserKey(file_smallest_key(file_index_)), + *read_options_.iterate_lower_bound) < 0; + } } TableCache* table_cache_; @@ -993,8 +1007,9 @@ class LevelIterator final : public InternalIterator { HistogramImpl* file_read_hist_; bool should_sample_; - bool for_compaction_; + TableReaderCaller caller_; bool skip_filters_; + bool may_be_out_of_lower_bound_ = true; size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; @@ -1007,13 +1022,55 @@ class LevelIterator final : public InternalIterator { }; void LevelIterator::Seek(const Slice& target) { - size_t new_file_index = FindFile(icomparator_, *flevel_, target); + // Check whether the seek key fall under the same file + bool need_to_reseek = true; + if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) { + const FdWithKeyRange& cur_file = flevel_->files[file_index_]; + if (icomparator_.InternalKeyComparator::Compare( + target, cur_file.largest_key) <= 0 && + icomparator_.InternalKeyComparator::Compare( + target, cur_file.smallest_key) >= 0) { + need_to_reseek = false; + assert(static_cast(FindFile(icomparator_, *flevel_, target)) == + file_index_); + } + } + if (need_to_reseek) { + TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile"); + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + InitFileIterator(new_file_index); + } - InitFileIterator(new_file_index); if (file_iter_.iter() != nullptr) { file_iter_.Seek(target); } - SkipEmptyFileForward(); + if (SkipEmptyFileForward() && prefix_extractor_ != nullptr && + file_iter_.iter() != nullptr && file_iter_.Valid()) { + // We've skipped the file we initially positioned to. In the prefix + // seek case, it is likely that the file is skipped because of + // prefix bloom or hash, where more keys are skipped. We then check + // the current key and invalidate the iterator if the prefix is + // already passed. + // When doing prefix iterator seek, when keys for one prefix have + // been exhausted, it can jump to any key that is larger. Here we are + // enforcing a stricter contract than that, in order to make it easier for + // higher layers (merging and DB iterator) to reason the correctness: + // 1. Within the prefix, the result should be accurate. + // 2. If keys for the prefix is exhausted, it is either positioned to the + // next key after the prefix, or make the iterator invalid. + // A side benefit will be that it invalidates the iterator earlier so that + // the upper level merging iterator can merge fewer child iterators. 
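The comment above sets out the contract after a prefix seek lands past skipped files: results within the prefix must be exact, and once the prefix is exhausted the iterator either stands on the next key after the prefix or goes invalid, so the merging iterator above has less work to do. The toy check below captures just the "different prefix, so invalidate" decision, with a fixed-length prefix standing in for the configured SliceTransform.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>

// Fixed-length prefix as a stand-in for the configured SliceTransform.
std::string Prefix(const std::string& key, size_t len = 3) {
  return key.substr(0, std::min(len, key.size()));
}

// After skipping filtered/empty files, decide whether the key the iterator
// landed on still shares the target's prefix; if not, the level iterator
// invalidates itself instead of surfacing a key from an unrelated prefix.
bool ShouldInvalidate(const std::string& target, const std::string& landed) {
  return Prefix(target) != Prefix(landed);
}

int main() {
  std::cout << ShouldInvalidate("abc123", "abc999") << "\n";  // 0: same prefix, keep it
  std::cout << ShouldInvalidate("abc123", "abd000") << "\n";  // 1: prefix exhausted, invalidate
  return 0;
}
```

Invalidating early is safe because the contract only promises exact results within the prefix, and it lets the upper-level merging iterator drop this child sooner.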
+ Slice target_user_key = ExtractUserKey(target); + Slice file_user_key = ExtractUserKey(file_iter_.key()); + if (prefix_extractor_->InDomain(target_user_key) && + (!prefix_extractor_->InDomain(file_user_key) || + user_comparator_.Compare( + prefix_extractor_->Transform(target_user_key), + prefix_extractor_->Transform(file_user_key)) != 0)) { + SetFileIterator(nullptr); + } + } + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekForPrev(const Slice& target) { @@ -1027,6 +1084,7 @@ void LevelIterator::SeekForPrev(const Slice& target) { file_iter_.SeekForPrev(target); SkipEmptyFileBackward(); } + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToFirst() { @@ -1035,6 +1093,7 @@ void LevelIterator::SeekToFirst() { file_iter_.SeekToFirst(); } SkipEmptyFileForward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::SeekToLast() { @@ -1043,15 +1102,17 @@ void LevelIterator::SeekToLast() { file_iter_.SeekToLast(); } SkipEmptyFileBackward(); + CheckMayBeOutOfLowerBound(); } void LevelIterator::Next() { NextImpl(); } -bool LevelIterator::NextAndGetResult(Slice* ret_key) { +bool LevelIterator::NextAndGetResult(IterateResult* result) { NextImpl(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); } return is_valid; } @@ -1062,25 +1123,28 @@ void LevelIterator::Prev() { SkipEmptyFileBackward(); } -void LevelIterator::SkipEmptyFileForward() { +bool LevelIterator::SkipEmptyFileForward() { + bool seen_empty_file = false; while (file_iter_.iter() == nullptr || (!file_iter_.Valid() && file_iter_.status().ok() && !file_iter_.iter()->IsOutOfBound())) { + seen_empty_file = true; // Move to next file if (file_index_ >= flevel_->num_files - 1) { // Already at the last file SetFileIterator(nullptr); - return; + break; } if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) { SetFileIterator(nullptr); - return; + break; } InitFileIterator(file_index_ + 1); if (file_iter_.iter() != nullptr) { file_iter_.SeekToFirst(); } } + return seen_empty_file; } void LevelIterator::SkipEmptyFileBackward() { @@ -1198,8 +1262,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, new RandomAccessFileReader( std::move(file), file_name, nullptr /* env */, nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - nullptr /* rate_limiter */, false /* for_compaction*/, - ioptions->listeners)); + nullptr /* rate_limiter */, ioptions->listeners)); s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, @@ -1225,6 +1288,60 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { return Status::OK(); } +Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, + std::string* out_str) { + if (max_entries_to_print <= 0) { + return Status::OK(); + } + int num_entries_left = max_entries_to_print; + + std::stringstream ss; + + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (const auto& file_meta : storage_info_.files_[level]) { + auto fname = + TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(), + file_meta->fd.GetPathId()); + + ss << "=== file : " << fname << " ===\n"; + + TableCache* table_cache = cfd_->table_cache(); + std::unique_ptr tombstone_iter; + + Status s = table_cache->GetRangeTombstoneIterator( + ReadOptions(), cfd_->internal_comparator(), *file_meta, + &tombstone_iter); + if (!s.ok()) { + return s; + } + if (tombstone_iter) { + 
tombstone_iter->SeekToFirst(); + + while (tombstone_iter->Valid() && num_entries_left > 0) { + ss << "start: " << tombstone_iter->start_key().ToString(true) + << " end: " << tombstone_iter->end_key().ToString(true) + << " seq: " << tombstone_iter->seq() << '\n'; + tombstone_iter->Next(); + num_entries_left--; + } + if (num_entries_left <= 0) { + break; + } + } + } + if (num_entries_left <= 0) { + break; + } + } + assert(num_entries_left >= 0); + if (num_entries_left <= 0) { + ss << "(results may not be complete)\n"; + } + + *out_str = ss.str(); + return Status::OK(); +} + Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, int level) { for (const auto& file_meta : storage_info_.files_[level]) { @@ -1335,16 +1452,15 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { assert(!ioptions->cf_paths.empty()); file_path = ioptions->cf_paths.back().path; } + const uint64_t file_number = file->fd.GetNumber(); files.emplace_back(SstFileMetaData{ - MakeTableFileName("", file->fd.GetNumber()), - file_path, - static_cast(file->fd.GetFileSize()), - file->fd.smallest_seqno, - file->fd.largest_seqno, - file->smallest.user_key().ToString(), + MakeTableFileName("", file_number), file_number, file_path, + static_cast(file->fd.GetFileSize()), file->fd.smallest_seqno, + file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted}); + file->being_compacted, file->oldest_blob_file_number, + file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime()}); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); @@ -1365,6 +1481,24 @@ uint64_t Version::GetSstFilesSize() { return sst_files_size; } +void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { + uint64_t oldest_time = port::kMaxUint64; + for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { + for (FileMetaData* meta : storage_info_.LevelFiles(level)) { + assert(meta->fd.table_reader != nullptr); + uint64_t file_creation_time = meta->TryGetFileCreationTime(); + if (file_creation_time == kUnknownFileCreationTime) { + *creation_time = 0; + return; + } + if (file_creation_time < oldest_time) { + oldest_time = file_creation_time; + } + } + } + *creation_time = oldest_time; +} + uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be inaccurate when: // (1) there exist merge keys @@ -1446,10 +1580,14 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), *file.file_metadata, - range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, arena, - false /* skip_filters */, 0 /* level */)); + read_options, soptions, cfd_->internal_comparator(), + *file.file_metadata, range_del_agg, + mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); } if (should_sample) { // Count ones for every L0 files. 
This is done per iterator creation @@ -1470,8 +1608,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, - range_del_agg)); + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, + range_del_agg, /*largest_compaction_key=*/nullptr)); } } @@ -1500,10 +1638,14 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, continue; } ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( - read_options, env_options, cfd_->internal_comparator(), *file->file_metadata, - &range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, &arena, - false /* skip_filters */, 0 /* level */)); + read_options, env_options, cfd_->internal_comparator(), + *file->file_metadata, &range_del_agg, + mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), + TableReaderCaller::kUserIterator, &arena, + /*skip_filters=*/false, /*level=*/0, + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -1517,7 +1659,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, IsFilterSkipped(level), level, + TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, &range_del_agg)); status = OverlapWithIterator( ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); @@ -1611,7 +1753,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found, bool* key_exists, SequenceNumber* seq, ReadCallback* callback, - bool* is_blob) { + bool* is_blob, bool do_merge) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -1623,11 +1765,18 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } PinnedIteratorsManager pinned_iters_mgr; + uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_get_id = vset_->block_cache_tracer_->NextGetId(); + } GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - value, value_found, merge_context, max_covering_tombstone_seq, this->env_, - seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); + do_merge ? value : nullptr, value_found, merge_context, do_merge, + max_covering_tombstone_seq, this->env_, seq, + merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + tracing_get_id); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -1691,7 +1840,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } - PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, + fp.GetHitFileLevel()); return; case GetContext::kDeleted: // Use empty error message for speed @@ -1709,11 +1859,14 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } f = fp.GetNextFile(); } - if (db_statistics_ != nullptr) { get_context.ReportCounters(); } if (GetContext::kMerge == get_context.State()) { + if (!do_merge) { + *status = Status::OK(); + return; + } if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -1745,7 +1898,12 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, if (merge_operator_) { pinned_iters_mgr.StartPinning(); } + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (vset_ && vset_->block_cache_tracer_ && + vset_->block_cache_tracer_->is_tracing_enabled()) { + tracing_mget_id = vset_->block_cache_tracer_->NextGetId(); + } // Even though we know the batch size won't be > MAX_BATCH_SIZE, // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive @@ -1755,15 +1913,20 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, nullptr, &(iter->merge_context), - &iter->max_covering_tombstone_seq, this->env_, &iter->seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); - iter->get_context = &get_ctx.back(); + iter->value, nullptr, &(iter->merge_context), true, + &iter->max_covering_tombstone_seq, this->env_, nullptr, + merge_operator_ ? 
&pinned_iters_mgr : nullptr, callback, is_blob, + tracing_mget_id); + } + int get_ctx_index = 0; + for (auto iter = range->begin(); iter != range->end(); + ++iter, get_ctx_index++) { + iter->get_context = &(get_ctx[get_ctx_index]); } MultiGetRange file_picker_range(*range, range->begin(), range->end()); FilePickerMultiGet fp( - storage_info_.files_, &file_picker_range, + &file_picker_range, &storage_info_.level_files_brief_, storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); @@ -2167,13 +2330,11 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, auto status = ioptions.env->GetCurrentTime(&_current_time); if (status.ok()) { const uint64_t current_time = static_cast(_current_time); - for (auto f : files) { - if (!f->being_compacted && f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = - f->fd.table_reader->GetTableProperties()->creation_time; - if (creation_time > 0 && - creation_time < (current_time - mutable_cf_options.ttl)) { + for (FileMetaData* f : files) { + if (!f->being_compacted) { + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time != 0 && + oldest_ancester_time < (current_time - mutable_cf_options.ttl)) { ttl_expired_files_count++; } } @@ -2325,12 +2486,11 @@ void VersionStorageInfo::ComputeExpiredTtlFiles( const uint64_t current_time = static_cast(_current_time); for (int level = 0; level < num_levels() - 1; level++) { - for (auto f : files_[level]) { - if (!f->being_compacted && f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = - f->fd.table_reader->GetTableProperties()->creation_time; - if (creation_time > 0 && creation_time < (current_time - ttl)) { + for (FileMetaData* f : files_[level]) { + if (!f->being_compacted) { + uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); + if (oldest_ancester_time > 0 && + oldest_ancester_time < (current_time - ttl)) { expired_ttl_files_.emplace_back(level, f); } } @@ -2351,26 +2511,30 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( return; } const uint64_t current_time = static_cast(temp_current_time); + + // If periodic_compaction_seconds is larger than current time, periodic + // compaction can't possibly be triggered. + if (periodic_compaction_seconds > current_time) { + return; + } + const uint64_t allowed_time_limit = current_time - periodic_compaction_seconds; for (int level = 0; level < num_levels(); level++) { for (auto f : files_[level]) { - if (!f->being_compacted && f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { + if (!f->being_compacted) { // Compute a file's modification time in the following order: // 1. Use file_creation_time table property if it is > 0. // 2. Use creation_time table property if it is > 0. // 3. Use file's mtime metadata if the above two table properties are 0. // Don't consider the file at all if the modification time cannot be // correctly determined based on the above conditions. 
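The comment above lists the order in which a file's "modification time" is resolved for periodic compaction; in the new code that becomes TryGetFileCreationTime, then TryGetOldestAncesterTime, then the file's mtime on disk, and the whole pass returns early when periodic_compaction_seconds exceeds the current time because no file could possibly qualify. A compressed sketch of that decision follows; mtime_lookup is a placeholder for Env::GetFileModificationTime, and the struct is an invented stand-in.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>

constexpr uint64_t kUnknown = 0;

// Invented stand-in for the two timestamps recorded on a file.
struct FileTimes {
  uint64_t file_creation_time = kUnknown;
  uint64_t oldest_ancester_time = kUnknown;
};

// mtime_lookup stands in for Env::GetFileModificationTime().
bool MarkedForPeriodicCompaction(const FileTimes& f, uint64_t now,
                                 uint64_t periodic_compaction_seconds,
                                 const std::function<uint64_t()>& mtime_lookup) {
  if (periodic_compaction_seconds > now) {
    return false;  // the allowed window would underflow; nothing can qualify
  }
  const uint64_t allowed_time_limit = now - periodic_compaction_seconds;
  uint64_t mod_time = f.file_creation_time;  // 1. recorded creation time
  if (mod_time == kUnknown) {
    mod_time = f.oldest_ancester_time;       // 2. oldest ancestor time
  }
  if (mod_time == kUnknown) {
    mod_time = mtime_lookup();               // 3. on-disk mtime as a last resort
  }
  return mod_time > 0 && mod_time < allowed_time_limit;
}

int main() {
  FileTimes f{kUnknown, 100};
  auto mtime = [] { return uint64_t{900}; };
  std::cout << MarkedForPeriodicCompaction(f, 1000, 500, mtime) << "\n";  // 1: ancestor time 100 < limit 500
  std::cout << MarkedForPeriodicCompaction(f, 100, 500, mtime) << "\n";   // 0: early out, window underflows
  return 0;
}
```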
- uint64_t file_modification_time = - f->fd.table_reader->GetTableProperties()->file_creation_time; - if (file_modification_time == 0) { - file_modification_time = - f->fd.table_reader->GetTableProperties()->creation_time; + uint64_t file_modification_time = f->TryGetFileCreationTime(); + if (file_modification_time == kUnknownFileCreationTime) { + file_modification_time = f->TryGetOldestAncesterTime(); } - if (file_modification_time == 0) { + if (file_modification_time == kUnknownOldestAncesterTime) { auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(), f->fd.GetPathId()); status = ioptions.env->GetFileModificationTime( @@ -2753,11 +2917,12 @@ void VersionStorageInfo::GetOverlappingInputs( FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); - if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { + if (begin != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) { // "f" is completely before specified range; skip it iter++; } else if (end != nullptr && - user_cmp->Compare(file_start, user_end) > 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) { // "f" is completely after specified range; skip it iter++; } else { @@ -2772,10 +2937,11 @@ void VersionStorageInfo::GetOverlappingInputs( iter = index.erase(iter); if (expand_range) { if (begin != nullptr && - user_cmp->Compare(file_start, user_begin) < 0) { + user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) { user_begin = file_start; } - if (end != nullptr && user_cmp->Compare(file_limit, user_end) > 0) { + if (end != nullptr && + user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) { user_end = file_limit; } } @@ -3098,7 +3264,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // base_bytes_min. We set it be base_bytes_min. base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_WARN(ioptions.info_log, + ROCKS_LOG_INFO(ioptions.info_log, "More existing levels in DB than needed. " "max_bytes_for_level_multiplier may not be guaranteed."); } else { @@ -3261,6 +3427,10 @@ std::string Version::DebugString(bool hex, bool print_stats) const { r.append(" .. 
"); r.append(files[i]->largest.DebugString(hex)); r.append("]"); + if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) { + r.append(" blob_file:"); + AppendNumberTo(&r, files[i]->oldest_blob_file_number); + } if (print_stats) { r.append("("); r.append(ToString( @@ -3292,14 +3462,60 @@ struct VersionSet::ManifestWriter { edit_list(e) {} }; +Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { + assert(edit); + if (edit->is_in_atomic_group_) { + TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); + if (replay_buffer_.empty()) { + replay_buffer_.resize(edit->remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); + } + read_edits_in_atomic_group_++; + if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + static_cast(replay_buffer_.size())) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); + return Status::Corruption("corrupted atomic group"); + } + replay_buffer_[read_edits_in_atomic_group_ - 1] = std::move(*edit); + if (read_edits_in_atomic_group_ == replay_buffer_.size()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit); + return Status::OK(); + } + return Status::OK(); + } + + // A normal edit. + if (!replay_buffer().empty()) { + TEST_SYNC_POINT_CALLBACK( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit); + return Status::Corruption("corrupted atomic group"); + } + return Status::OK(); +} + +bool AtomicGroupReadBuffer::IsFull() const { + return read_edits_in_atomic_group_ == replay_buffer_.size(); +} + +bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); } + +void AtomicGroupReadBuffer::Clear() { + read_edits_in_atomic_group_ = 0; + replay_buffer_.clear(); +} + VersionSet::VersionSet(const std::string& dbname, const ImmutableDBOptions* _db_options, const EnvOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller) - : column_family_set_( - new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, - write_buffer_manager, write_controller)), + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer) + : column_family_set_(new ColumnFamilySet( + dbname, _db_options, storage_options, table_cache, + write_buffer_manager, write_controller, block_cache_tracer)), env_(_db_options->env), dbname_(dbname), db_options_(_db_options), @@ -3313,18 +3529,13 @@ VersionSet::VersionSet(const std::string& dbname, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(storage_options) {} - -void CloseTables(void* ptr, size_t) { - TableReader* table_reader = reinterpret_cast(ptr); - table_reader->Close(); -} + env_options_(storage_options), + block_cache_tracer_(block_cache_tracer) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet Cache* table_cache = column_family_set_->get_table_cache(); - table_cache->ApplyToAllCacheEntries(&CloseTables, false /* thread_safe */); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { @@ -3464,7 +3675,14 @@ Status VersionSet::ProcessManifestWrites( } else if (group_start != std::numeric_limits::max()) { group_start = std::numeric_limits::max(); } - LogAndApplyHelper(last_writer->cfd, builder, e, mu); + Status s = LogAndApplyHelper(last_writer->cfd, builder, e, mu); + if (!s.ok()) { + // 
free up the allocated memory + for (auto v : versions) { + delete v; + } + return s; + } batch_edits.push_back(e); } } @@ -3472,7 +3690,14 @@ Status VersionSet::ProcessManifestWrites( assert(!builder_guards.empty() && builder_guards.size() == versions.size()); auto* builder = builder_guards[i]->version_builder(); - builder->SaveTo(versions[i]->storage_info()); + Status s = builder->SaveTo(versions[i]->storage_info()); + if (!s.ok()) { + // free up the allocated memory + for (auto v : versions) { + delete v; + } + return s; + } } } @@ -3575,7 +3800,7 @@ Status VersionSet::ProcessManifestWrites( nullptr, db_options_->listeners)); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteSnapshot(descriptor_log_.get()); + s = WriteCurrentStateToManifest(descriptor_log_.get()); } } @@ -3601,8 +3826,9 @@ Status VersionSet::ProcessManifestWrites( rocksdb_kill_odds * REDUCE_ODDS2); #ifndef NDEBUG if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { - TEST_SYNC_POINT( - "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"); + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + nullptr); TEST_SYNC_POINT( "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"); } @@ -3821,8 +4047,6 @@ Status VersionSet::LogAndApply( } } if (0 == num_undropped_cfds) { - // TODO (yanqin) maybe use a different status code to denote column family - // drop other than OK and ShutdownInProgress for (int i = 0; i != num_cfds; ++i) { manifest_writers_.pop_front(); } @@ -3830,7 +4054,7 @@ Status VersionSet::LogAndApply( if (!manifest_writers_.empty()) { manifest_writers_.front()->cv.Signal(); } - return Status::ShutdownInProgress(); + return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log, @@ -3854,9 +4078,9 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { } } -void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, - VersionBuilder* builder, VersionEdit* edit, - InstrumentedMutex* mu) { +Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, + VersionBuilder* builder, VersionEdit* edit, + InstrumentedMutex* mu) { #ifdef NDEBUG (void)cfd; #endif @@ -3880,7 +4104,9 @@ void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_ : last_sequence_); - builder->Apply(edit); + Status s = builder->Apply(edit); + + return s; } Status VersionSet::ApplyOneVersionEditToBuilder( @@ -3889,10 +4115,7 @@ Status VersionSet::ApplyOneVersionEditToBuilder( std::unordered_map& column_families_not_found, std::unordered_map>& builders, - bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + VersionEditParams* version_edit_params) { // Not found means that user didn't supply that column // family option AND we encountered column family add // record. 
Once we encounter column family drop record, @@ -3916,11 +4139,23 @@ Status VersionSet::ApplyOneVersionEditToBuilder( edit.column_family_name_); } auto cf_options = name_to_options.find(edit.column_family_name_); - if (cf_options == name_to_options.end()) { + // implicitly add persistent_stats column family without requiring user + // to specify + bool is_persistent_stats_column_family = + edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + if (cf_options == name_to_options.end() && + !is_persistent_stats_column_family) { column_families_not_found.insert( {edit.column_family_, edit.column_family_name_}); } else { - cfd = CreateColumnFamily(cf_options->second, &edit); + // recover persistent_stats CF from a DB that already contains it + if (is_persistent_stats_column_family) { + ColumnFamilyOptions cfo; + OptimizeForPersistentStats(&cfo); + cfd = CreateColumnFamily(cfo, &edit); + } else { + cfd = CreateColumnFamily(cf_options->second, &edit); + } cfd->set_initialized(); builders.insert(std::make_pair( edit.column_family_, std::unique_ptr( @@ -3961,71 +4196,73 @@ Status VersionSet::ApplyOneVersionEditToBuilder( // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->version_builder()->Apply(&edit); + Status s = builder->second->version_builder()->Apply(&edit); + if (!s.ok()) { + return s; + } } - return ExtractInfoFromVersionEdit( - cfd, edit, have_log_number, log_number, have_prev_log_number, - previous_log_number, have_next_file, next_file, have_last_sequence, - last_sequence, min_log_number_to_keep, max_column_family); + return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params); } Status VersionSet::ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number, - uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family) { + ColumnFamilyData* cfd, const VersionEdit& from_edit, + VersionEditParams* version_edit_params) { if (cfd != nullptr) { - if (edit.has_log_number_) { - if (cfd->GetLogNumber() > edit.log_number_) { + if (from_edit.has_db_id_) { + version_edit_params->SetDBId(from_edit.db_id_); + } + if (from_edit.has_log_number_) { + if (cfd->GetLogNumber() > from_edit.log_number_) { ROCKS_LOG_WARN( db_options_->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { - cfd->SetLogNumber(edit.log_number_); - *have_log_number = true; - *log_number = edit.log_number_; + cfd->SetLogNumber(from_edit.log_number_); + version_edit_params->SetLogNumber(from_edit.log_number_); } } - if (edit.has_comparator_ && - edit.comparator_ != cfd->user_comparator()->Name()) { + if (from_edit.has_comparator_ && + from_edit.comparator_ != cfd->user_comparator()->Name()) { return Status::InvalidArgument( cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); + "does not match existing comparator " + from_edit.comparator_); } } - if (edit.has_prev_log_number_) { - *previous_log_number = edit.prev_log_number_; - *have_prev_log_number = true; + if (from_edit.has_prev_log_number_) { + version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_); } - if (edit.has_next_file_number_) { - *next_file = edit.next_file_number_; - *have_next_file = true; + if (from_edit.has_next_file_number_) { + 
version_edit_params->SetNextFile(from_edit.next_file_number_); } - if (edit.has_max_column_family_) { - *max_column_family = edit.max_column_family_; + if (from_edit.has_max_column_family_) { + version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_); } - if (edit.has_min_log_number_to_keep_) { - *min_log_number_to_keep = - std::max(*min_log_number_to_keep, edit.min_log_number_to_keep_); + if (from_edit.has_min_log_number_to_keep_) { + version_edit_params->min_log_number_to_keep_ = + std::max(version_edit_params->min_log_number_to_keep_, + from_edit.min_log_number_to_keep_); } - if (edit.has_last_sequence_) { - *last_sequence = edit.last_sequence_; - *have_last_sequence = true; + if (from_edit.has_last_sequence_) { + version_edit_params->SetLastSequence(from_edit.last_sequence_); } return Status::OK(); } -Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { +Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_path, + uint64_t* manifest_file_number) { + assert(env != nullptr); assert(manifest_path != nullptr); + assert(manifest_file_number != nullptr); + std::string fname; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), &fname); + Status s = ReadFileToString(env, CurrentFileName(dbname), &fname); if (!s.ok()) { return s; } @@ -4035,21 +4272,87 @@ Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) { // remove the trailing '\n' fname.resize(fname.size() - 1); FileType type; - bool parse_ok = ParseFileName(fname, &manifest_file_number_, &type); + bool parse_ok = ParseFileName(fname, manifest_file_number, &type); if (!parse_ok || type != kDescriptorFile) { return Status::Corruption("CURRENT file corrupted"); } - *manifest_path = dbname_; - if (dbname_.back() != '/') { + *manifest_path = dbname; + if (dbname.back() != '/') { manifest_path->push_back('/'); } *manifest_path += fname; return Status::OK(); } +Status VersionSet::ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map>& + builders, + VersionEditParams* version_edit_params, std::string* db_id) { + assert(reader != nullptr); + assert(read_buffer != nullptr); + Status s; + Slice record; + std::string scratch; + size_t recovered_edits = 0; + while (reader->ReadRecord(&record, &scratch) && s.ok()) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + if (edit.has_db_id_) { + db_id_ = edit.GetDbId(); + if (db_id != nullptr) { + db_id->assign(edit.GetDbId()); + } + } + s = read_buffer->AddEdit(&edit); + if (!s.ok()) { + break; + } + if (edit.is_in_atomic_group_) { + if (read_buffer->IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer->replay_buffer()) { + s = ApplyOneVersionEditToBuilder(e, name_to_options, + column_families_not_found, builders, + version_edit_params); + if (!s.ok()) { + break; + } + recovered_edits++; + } + if (!s.ok()) { + break; + } + read_buffer->Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder(edit, name_to_options, + column_families_not_found, builders, + version_edit_params); + if (s.ok()) { + recovered_edits++; + } + } + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. 
+ read_buffer->Clear(); + } + TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", + &recovered_edits); + return s; +} + Status VersionSet::Recover( - const std::vector& column_families, - bool read_only) { + const std::vector& column_families, bool read_only, + std::string* db_id) { std::unordered_map cf_name_to_options; for (auto cf : column_families) { cf_name_to_options.insert({cf.name, cf.options}); @@ -4061,7 +4364,8 @@ Status VersionSet::Recover( // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; - Status s = GetCurrentManifestPath(&manifest_path); + Status s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); if (!s.ok()) { return s; } @@ -4078,7 +4382,8 @@ Status VersionSet::Recover( return s; } manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path)); + new SequentialFileReader(std::move(manifest_file), manifest_path, + db_options_->log_readahead_size)); } uint64_t current_manifest_file_size; s = env_->GetFileSize(manifest_path, ¤t_manifest_file_size); @@ -4086,16 +4391,6 @@ Status VersionSet::Recover( return s; } - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; std::unordered_map> builders; @@ -4115,7 +4410,7 @@ Status VersionSet::Recover( builders.insert( std::make_pair(0, std::unique_ptr( new BaseReferencedVersionBuilder(default_cfd)))); - + VersionEditParams version_edit_params; { VersionSet::LogReporter reporter; reporter.status = &s; @@ -4123,88 +4418,33 @@ Status VersionSet::Recover( true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; - std::vector replay_buffer; - size_t num_entries_decoded = 0; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - if (edit.is_in_atomic_group_) { - if (replay_buffer.empty()) { - replay_buffer.resize(edit.remaining_entries_ + 1); - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:FirstInAtomicGroup", - &edit); - } - ++num_entries_decoded; - if (num_entries_decoded + edit.remaining_entries_ != - static_cast(replay_buffer.size())) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:IncorrectAtomicGroupSize", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - replay_buffer[num_entries_decoded - 1] = std::move(edit); - if (num_entries_decoded == replay_buffer.size()) { - TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:LastInAtomicGroup", - &edit); - for (auto& e : replay_buffer) { - s = ApplyOneVersionEditToBuilder( - e, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - if (!s.ok()) { - break; - } - } - replay_buffer.clear(); - num_entries_decoded = 0; - } - TEST_SYNC_POINT("VersionSet::Recover:AtomicGroup"); - } else { - if (!replay_buffer.empty()) { - TEST_SYNC_POINT_CALLBACK( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", &edit); - s = Status::Corruption("corrupted atomic group"); - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, 
builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } - if (!s.ok()) { - break; - } - } + AtomicGroupReadBuffer read_buffer; + s = ReadAndRecover(&reader, &read_buffer, cf_name_to_options, + column_families_not_found, builders, + &version_edit_params, db_id); } if (s.ok()) { - if (!have_next_file) { + if (!version_edit_params.has_next_file_number_) { s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!have_log_number) { + } else if (!version_edit_params.has_log_number_) { s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!have_last_sequence) { + } else if (!version_edit_params.has_last_sequence_) { s = Status::Corruption("no last-sequence-number entry in descriptor"); } - if (!have_prev_log_number) { - previous_log_number = 0; + if (!version_edit_params.has_prev_log_number_) { + version_edit_params.SetPrevLogNumber(0); } - column_family_set_->UpdateMaxColumnFamily(max_column_family); + column_family_set_->UpdateMaxColumnFamily( + version_edit_params.max_column_family_); // When reading DB generated using old release, min_log_number_to_keep=0. // All log files will be scanned for potential prepare entries. - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); - MarkFileNumberUsed(previous_log_number); - MarkFileNumberUsed(log_number); + MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_); + MarkFileNumberUsed(version_edit_params.prev_log_number_); + MarkFileNumberUsed(version_edit_params.log_number_); } // there were some column families in the MANIFEST that weren't specified @@ -4265,11 +4505,11 @@ Status VersionSet::Recover( } manifest_file_size_ = current_manifest_file_size; - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; + next_file_number_.store(version_edit_params.next_file_number_ + 1); + last_allocated_sequence_ = version_edit_params.last_sequence_; + last_published_sequence_ = version_edit_params.last_sequence_; + last_sequence_ = version_edit_params.last_sequence_; + prev_log_number_ = version_edit_params.prev_log_number_; ROCKS_LOG_INFO( db_options_->info_log, @@ -4278,8 +4518,8 @@ Status VersionSet::Recover( ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", - manifest_path.c_str(), manifest_file_number_, - next_file_number_.load(), last_sequence_.load(), log_number, + manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), + last_sequence_.load(), version_edit_params.log_number_, prev_log_number_, column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); @@ -4303,26 +4543,22 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, // so we're fine using the defaults EnvOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env, CurrentFileName(dbname), ¤t); + std::string manifest_path; + uint64_t manifest_file_number; + Status s = GetCurrentManifestPath(dbname, env, &manifest_path, + &manifest_file_number); if (!s.ok()) { return s; } - if (current.empty() || current[current.size()-1] != '\n') { - return Status::Corruption("CURRENT file does not end 
with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname + "/" + current; std::unique_ptr file_reader; { std::unique_ptr file; - s = env->NewSequentialFile(dscname, &file, soptions); + s = env->NewSequentialFile(manifest_path, &file, soptions); if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), dscname)); + file_reader.reset(new SequentialFileReader(std::move(file), manifest_path)); } std::map column_family_names; @@ -4385,7 +4621,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, options->table_cache_numshardbits)); WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); - VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc); + VersionSet versions(dbname, &db_options, env_options, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); Status status; std::vector dummy; @@ -4429,7 +4666,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } // we need to allocate an array with the old number of levels size to - // avoid SIGSEGV in WriteSnapshot() + // avoid SIGSEGV in WriteCurrentStatetoManifest() // however, all levels bigger or equal to new_levels will be empty std::vector* new_files_list = new std::vector[current_levels]; @@ -4466,7 +4703,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file), dscname)); + file_reader.reset(new SequentialFileReader( + std::move(file), dscname, db_options_->log_readahead_size)); } bool have_prev_log_number = false; @@ -4562,7 +4800,10 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->version_builder()->Apply(&edit); + s = builder->second->version_builder()->Apply(&edit); + if (!s.ok()) { + break; + } } if (cfd != nullptr && edit.has_log_number_) { @@ -4665,7 +4906,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { next_file_number_.store(number + 1, std::memory_order_relaxed); } } - // Called only either from ::LogAndApply which is protected by mutex or during // recovery which is single-threaded. void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { @@ -4674,7 +4914,7 @@ void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { } } -Status VersionSet::WriteSnapshot(log::Writer* log) { +Status VersionSet::WriteCurrentStateToManifest(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? // WARNING: This method doesn't hold a mutex!! @@ -4682,6 +4922,22 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // This is done without DB mutex lock held, but only within single-threaded // LogAndApply. Column family manipulations can only happen within LogAndApply // (the same single thread), so we're safe to iterate. 
+ + if (db_options_->write_dbid_to_manifest) { + VersionEdit edit_for_db_id; + assert(!db_id_.empty()); + edit_for_db_id.SetDBId(db_id_); + std::string db_id_record; + if (!edit_for_db_id.EncodeTo(&db_id_record)) { + return Status::Corruption("Unable to Encode VersionEdit:" + + edit_for_db_id.DebugString(true)); + } + Status add_record = log->AddRecord(db_id_record); + if (!add_record.ok()) { + return add_record; + } + } + for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { continue; @@ -4720,7 +4976,8 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, - f->marked_for_compaction); + f->marked_for_compaction, f->oldest_blob_file_number, + f->oldest_ancester_time, f->file_creation_time); } } edit.SetLogNumber(cfd->GetLogNumber()); @@ -4735,7 +4992,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } } } - return Status::OK(); } @@ -4745,111 +5001,198 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where // we avoid doing binary search for the keys b and c twice and instead somehow // maintain state of where they first appear in the files. -uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start, +uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + Version* v, const Slice& start, const Slice& end, int start_level, - int end_level) { + int end_level, TableReaderCaller caller) { + const auto& icmp = v->cfd_->internal_comparator(); + // pre-condition - assert(v->cfd_->internal_comparator().Compare(start, end) <= 0); + assert(icmp.Compare(start, end) <= 0); - uint64_t size = 0; + uint64_t total_full_size = 0; const auto* vstorage = v->storage_info(); - end_level = end_level == -1 - ? vstorage->num_non_empty_levels() - : std::min(end_level, vstorage->num_non_empty_levels()); + const int num_non_empty_levels = vstorage->num_non_empty_levels(); + end_level = (end_level == -1) ? num_non_empty_levels + : std::min(end_level, num_non_empty_levels); assert(start_level <= end_level); - for (int level = start_level; level < end_level; level++) { + // Outline of the optimization that uses options.files_size_error_margin. + // When approximating the files total size that is used to store a keys range, + // we first sum up the sizes of the files that fully fall into the range. + // Then we sum up the sizes of all the files that may intersect with the range + // (this includes all files in L0 as well). Then, if total_intersecting_size + // is smaller than total_full_size * options.files_size_error_margin - we can + // infer that the intersecting files have a sufficiently negligible + // contribution to the total size, and we can approximate the storage required + // for the keys in range as just half of the intersecting_files_size. + // E.g., if the value of files_size_error_margin is 0.1, then the error of the + // approximation is limited to only ~10% of the total size of files that fully + // fall into the keys range. In such case, this helps to avoid a costly + // process of binary searching the intersecting files that is required only + // for a more precise calculation of the total size. 
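Aside, not part of the patch: the files_size_error_margin heuristic outlined in the comment above can be sketched as a standalone function. FileSpan and EstimateBytesInRange are hypothetical; in the patch the boundary files are the per-level first/last files plus all of L0, and the precise path calls ApproximateSize for the first files and ApproximateOffsetOf for the last ones, which the sketch collapses into a single estimator.

#include <cstdint>
#include <vector>

// Illustrative sketch only -- not RocksDB code.
struct FileSpan {
  uint64_t file_size;
  uint64_t bytes_in_range;  // what a precise per-file estimate would return
};

uint64_t EstimateBytesInRange(const FileSpan& f) { return f.bytes_in_range; }

// full_files: files that fall entirely inside [start, end); their whole size
// counts. boundary_files: first/last files per level plus all of L0; they may
// only partially overlap the range.
uint64_t ApproximateRangeSizeSketch(const std::vector<FileSpan>& full_files,
                                    const std::vector<FileSpan>& boundary_files,
                                    double files_size_error_margin) {
  uint64_t total_full_size = 0;
  for (const FileSpan& f : full_files) {
    total_full_size += f.file_size;
  }
  uint64_t total_intersecting_size = 0;
  for (const FileSpan& f : boundary_files) {
    total_intersecting_size += f.file_size;
  }
  if (files_size_error_margin > 0 &&
      total_intersecting_size <
          static_cast<uint64_t>(total_full_size * files_size_error_margin)) {
    // Boundary files contribute little; assume half of each overlaps the
    // range and skip the costly per-file binary searches.
    return total_full_size + total_intersecting_size / 2;
  }
  // Otherwise fall back to a precise per-file estimate for the boundary files.
  for (const FileSpan& f : boundary_files) {
    total_full_size += EstimateBytesInRange(f);
  }
  return total_full_size;
}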
+ + autovector first_files; + autovector last_files; + + // scan all the levels + for (int level = start_level; level < end_level; ++level) { const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level); - if (!files_brief.num_files) { + if (files_brief.num_files == 0) { // empty level, skip exploration continue; } - if (!level) { - // level 0 data is sorted order, handle the use case explicitly - size += ApproximateSizeLevel0(v, files_brief, start, end); + if (level == 0) { + // level 0 files are not in sorted order, we need to iterate through + // the list to compute the total bytes that require scanning, + // so handle the case explicitly (similarly to first_files case) + for (size_t i = 0; i < files_brief.num_files; i++) { + first_files.push_back(&files_brief.files[i]); + } continue; } assert(level > 0); assert(files_brief.num_files > 0); - // identify the file position for starting key - const uint64_t idx_start = FindFileInRange( - v->cfd_->internal_comparator(), files_brief, start, - /*start=*/0, static_cast(files_brief.num_files - 1)); - assert(idx_start < files_brief.num_files); - - // scan all files from the starting position until the ending position - // inferred from the sorted order - for (uint64_t i = idx_start; i < files_brief.num_files; i++) { - uint64_t val; - val = ApproximateSize(v, files_brief.files[i], end); - if (!val) { - // the files after this will not have the range - break; - } + // identify the file position for start key + const int idx_start = + FindFileInRange(icmp, files_brief, start, 0, + static_cast(files_brief.num_files - 1)); + assert(static_cast(idx_start) < files_brief.num_files); - size += val; + // identify the file position for end key + int idx_end = idx_start; + if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) { + idx_end = + FindFileInRange(icmp, files_brief, end, idx_start, + static_cast(files_brief.num_files - 1)); + } + assert(idx_end >= idx_start && + static_cast(idx_end) < files_brief.num_files); - if (i == idx_start) { - // subtract the bytes needed to be scanned to get to the starting - // key - val = ApproximateSize(v, files_brief.files[i], start); - assert(size >= val); - size -= val; - } + // scan all files from the starting index to the ending index + // (inferred from the sorted order) + + // first scan all the intermediate full files (excluding first and last) + for (int i = idx_start + 1; i < idx_end; ++i) { + uint64_t file_size = files_brief.files[i].fd.GetFileSize(); + // The entire file falls into the range, so we can just take its size. + assert(file_size == + ApproximateSize(v, files_brief.files[i], start, end, caller)); + total_full_size += file_size; + } + + // save the first and the last files (which may be the same file), so we + // can scan them later. + first_files.push_back(&files_brief.files[idx_start]); + if (idx_start != idx_end) { + // we need to estimate size for both files, only if they are different + last_files.push_back(&files_brief.files[idx_end]); } } - return size; -} + // The sum of all file sizes that intersect the [start, end] keys range. 
+ uint64_t total_intersecting_size = 0; + for (const auto* file_ptr : first_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } + for (const auto* file_ptr : last_files) { + total_intersecting_size += file_ptr->fd.GetFileSize(); + } -uint64_t VersionSet::ApproximateSizeLevel0(Version* v, - const LevelFilesBrief& files_brief, - const Slice& key_start, - const Slice& key_end) { - // level 0 files are not in sorted order, we need to iterate through - // the list to compute the total bytes that require scanning - uint64_t size = 0; - for (size_t i = 0; i < files_brief.num_files; i++) { - const uint64_t start = ApproximateSize(v, files_brief.files[i], key_start); - const uint64_t end = ApproximateSize(v, files_brief.files[i], key_end); - assert(end >= start); - size += end - start; + // Now scan all the first & last files at each level, and estimate their size. + // If the total_intersecting_size is less than X% of the total_full_size - we + // want to approximate the result in order to avoid the costly binary search + // inside ApproximateSize. We use half of file size as an approximation below. + + const double margin = options.files_size_error_margin; + if (margin > 0 && total_intersecting_size < + static_cast(total_full_size * margin)) { + total_full_size += total_intersecting_size / 2; + } else { + // Estimate for all the first files, at each level + for (const auto file_ptr : first_files) { + total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); + } + + // Estimate for all the last files, at each level + for (const auto file_ptr : last_files) { + // We could use ApproximateSize here, but calling ApproximateOffsetOf + // directly is just more efficient. + total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); + } } - return size; + + return total_full_size; } -uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key) { +uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, + TableReaderCaller caller) { // pre-condition assert(v); + const auto& icmp = v->cfd_->internal_comparator(); uint64_t result = 0; - if (v->cfd_->internal_comparator().Compare(f.largest_key, key) <= 0) { + if (icmp.Compare(f.largest_key, key) <= 0) { // Entire file is before "key", so just add the file size result = f.fd.GetFileSize(); - } else if (v->cfd_->internal_comparator().Compare(f.smallest_key, key) > 0) { + } else if (icmp.Compare(f.smallest_key, key) > 0) { // Entire file is after "key", so ignore result = 0; } else { // "key" falls in the range for this table. Add the // approximate offset of "key" within the table. 
- TableReader* table_reader_ptr; - InternalIterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), - *f.file_metadata, nullptr /* range_del_agg */, - v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr); - if (table_reader_ptr != nullptr) { - result = table_reader_ptr->ApproximateOffsetOf(key); + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache != nullptr) { + result = table_cache->ApproximateOffsetOf( + key, f.file_metadata->fd, caller, icmp, + v->GetMutableCFOptions().prefix_extractor.get()); } - delete iter; } return result; } +uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, + const Slice& start, const Slice& end, + TableReaderCaller caller) { + // pre-condition + assert(v); + const auto& icmp = v->cfd_->internal_comparator(); + assert(icmp.Compare(start, end) <= 0); + + if (icmp.Compare(f.largest_key, start) <= 0 || + icmp.Compare(f.smallest_key, end) > 0) { + // Entire file is before or after the start/end keys range + return 0; + } + + if (icmp.Compare(f.smallest_key, start) >= 0) { + // Start of the range is before the file start - approximate by end offset + return ApproximateOffsetOf(v, f, end, caller); + } + + if (icmp.Compare(f.largest_key, end) < 0) { + // End of the range is after the file end - approximate by subtracting + // start offset from the file size + uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + assert(f.fd.GetFileSize() >= start_offset); + return f.fd.GetFileSize() - start_offset; + } + + // The interval falls entirely in the range for this file. + TableCache* table_cache = v->cfd_->table_cache(); + if (table_cache == nullptr) { + return 0; + } + return table_cache->ApproximateSize( + start, end, f.file_metadata->fd, caller, icmp, + v->GetMutableCFOptions().prefix_extractor.get()); +} + void VersionSet::AddLiveFiles(std::vector* live_list) { // pre-calculate space requirement int64_t total_files = 0; @@ -4922,10 +5265,12 @@ InternalIterator* VersionSet::MakeInputIterator( read_options, env_options_compactions, cfd->internal_comparator(), *flevel->files[i].file_metadata, range_del_agg, c->mutable_cf_options()->prefix_extractor.get(), - nullptr /* table_reader_ptr */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, nullptr /* arena */, - false /* skip_filters */, static_cast(which) /* level */); + /*table_reader_ptr=*/nullptr, + /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, + /*arena=*/nullptr, + /*skip_filters=*/false, /*level=*/static_cast(which), + /*smallest_compaction_key=*/nullptr, + /*largest_compaction_key=*/nullptr); } } else { // Create concatenating iterator for the files from this level @@ -4933,10 +5278,10 @@ InternalIterator* VersionSet::MakeInputIterator( cfd->table_cache(), read_options, env_options_compactions, cfd->internal_comparator(), c->input_levels(which), c->mutable_cf_options()->prefix_extractor.get(), - false /* should_sample */, - nullptr /* no per level latency histogram */, - true /* for_compaction */, false /* skip_filters */, - static_cast(which) /* level */, range_del_agg, + /*should_sample=*/false, + /*no per level latency histogram=*/nullptr, + TableReaderCaller::kCompaction, /*skip_filters=*/false, + /*level=*/static_cast(which), range_del_agg, c->boundaries(which)); } } @@ -5040,7 +5385,9 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { assert(!cfd->ioptions()->cf_paths.empty()); filemetadata.db_path = 
cfd->ioptions()->cf_paths.back().path; } - filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); + const uint64_t file_number = file->fd.GetNumber(); + filemetadata.name = MakeTableFileName("", file_number); + filemetadata.file_number = file_number; filemetadata.level = level; filemetadata.size = static_cast(file->fd.GetFileSize()); filemetadata.smallestkey = file->smallest.user_key().ToString(); @@ -5052,6 +5399,7 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { filemetadata.being_compacted = file->being_compacted; filemetadata.num_entries = file->num_entries; filemetadata.num_deletions = file->num_deletions; + filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; metadata->push_back(filemetadata); } } @@ -5137,7 +5485,9 @@ ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, WriteBufferManager* write_buffer_manager, WriteController* write_controller) : VersionSet(dbname, _db_options, _env_options, table_cache, - write_buffer_manager, write_controller) {} + write_buffer_manager, write_controller, + /*block_cache_tracer=*/nullptr), + number_of_edits_to_skip_(0) {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -5168,17 +5518,6 @@ Status ReactiveVersionSet::Recover( // In recovery, nobody else can access it, so it's fine to set it to be // initialized earlier. default_cfd->set_initialized(); - - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; std::unordered_map> builders; std::unordered_map column_families_not_found; @@ -5194,25 +5533,17 @@ Status ReactiveVersionSet::Recover( log::Reader* reader = manifest_reader->get(); int retry = 0; + VersionEdit version_edit; while (s.ok() && retry < 1) { assert(reader != nullptr); Slice record; std::string scratch; - while (s.ok() && reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - s = ApplyOneVersionEditToBuilder( - edit, cf_name_to_options, column_families_not_found, builders, - &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); - } + s = ReadAndRecover(reader, &read_buffer_, cf_name_to_options, + column_families_not_found, builders, &version_edit); if (s.ok()) { - bool enough = have_next_file && have_log_number && have_last_sequence; + bool enough = version_edit.has_next_file_number_ && + version_edit.has_log_number_ && + version_edit.has_last_sequence_; if (enough) { for (const auto& cf : column_families) { auto cfd = column_family_set_->GetColumnFamily(cf.name); @@ -5254,14 +5585,14 @@ Status ReactiveVersionSet::Recover( } if (s.ok()) { - if (!have_prev_log_number) { - previous_log_number = 0; + if (!version_edit.has_prev_log_number_) { + version_edit.prev_log_number_ = 0; } - column_family_set_->UpdateMaxColumnFamily(max_column_family); + column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_); - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); - MarkFileNumberUsed(previous_log_number); - MarkFileNumberUsed(log_number); + MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_); + MarkFileNumberUsed(version_edit.prev_log_number_); + 
MarkFileNumberUsed(version_edit.log_number_); for (auto cfd : *column_family_set_) { assert(builders.count(cfd->GetID()) > 0); @@ -5294,11 +5625,11 @@ Status ReactiveVersionSet::Recover( !(db_options_->skip_stats_update_on_db_open)); AppendVersion(cfd, v); } - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; + next_file_number_.store(version_edit.next_file_number_ + 1); + last_allocated_sequence_ = version_edit.last_sequence_; + last_published_sequence_ = version_edit.last_sequence_; + last_sequence_ = version_edit.last_sequence_; + prev_log_number_ = version_edit.prev_log_number_; for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { continue; @@ -5320,17 +5651,7 @@ Status ReactiveVersionSet::ReadAndApply( mu->AssertHeld(); Status s; - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t previous_log_number = 0; - uint32_t max_column_family = 0; - uint64_t min_log_number_to_keep = 0; - + uint64_t applied_edits = 0; while (s.ok()) { Slice record; std::string scratch; @@ -5342,73 +5663,50 @@ Status ReactiveVersionSet::ReadAndApply( if (!s.ok()) { break; } - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. Ignore it for now. - if (nullptr == cfd) { + + // Skip the first VersionEdits of each MANIFEST generated by + // VersionSet::WriteCurrentStatetoManifest. 
+ if (number_of_edits_to_skip_ > 0) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd != nullptr && !cfd->IsDropped()) { + --number_of_edits_to_skip_; + } continue; } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - s = ApplyOneVersionEditToBuilder( - edit, &have_log_number, &log_number, &have_prev_log_number, - &previous_log_number, &have_next_file, &next_file, - &have_last_sequence, &last_sequence, &min_log_number_to_keep, - &max_column_family); + + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); - if (!s.ok() && !s.IsPathNotFound()) { - break; - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } else { // s.ok() == true - auto version = new Version(cfd, this, env_options_, - *cfd->GetLatestMutableCFOptions(), - current_version_number_++); - builder->SaveTo(version->storage_info()); - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); + VersionEdit temp_edit; + if (edit.is_in_atomic_group_) { + if (read_buffer_.IsFull()) { + // Apply edits in an atomic group when we have read all edits in the + // group. + for (auto& e : read_buffer_.replay_buffer()) { + s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit); + if (!s.ok()) { + break; + } + applied_edits++; + } + if (!s.ok()) { + break; + } + read_buffer_.Clear(); + } + } else { + // Apply a normal edit immediately. + s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit); + if (s.ok()) { + applied_edits++; } } - if (have_next_file) { - next_file_number_.store(next_file + 1); - } - if (have_last_sequence) { - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - } - if (have_prev_log_number) { - prev_log_number_ = previous_log_number; - MarkFileNumberUsed(previous_log_number); - } - if (have_log_number) { - MarkFileNumberUsed(log_number); - } - column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + } + if (!s.ok()) { + // Clear the buffer if we fail to decode/apply an edit. + read_buffer_.Clear(); } // It's possible that: // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. @@ -5418,8 +5716,37 @@ Status ReactiveVersionSet::ReadAndApply( // find the next MANIFEST, we should exit the loop. 
s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); reader = manifest_reader->get(); - if (s.ok() && reader->file()->file_name() == old_manifest_path) { - break; + if (s.ok()) { + if (reader->file()->file_name() == old_manifest_path) { + // Still processing the same MANIFEST, thus no need to continue this + // loop since no record is available if we have reached here. + break; + } else { + // We have switched to a new MANIFEST whose first records have been + // generated by VersionSet::WriteCurrentStatetoManifest. Since the + // secondary instance has already finished recovering upon start, there + // is no need for the secondary to process these records. Actually, if + // the secondary were to replay these records, the secondary may end up + // adding the same SST files AGAIN to each column family, causing + // consistency checks done by VersionBuilder to fail. Therefore, we + // record the number of records to skip at the beginning of the new + // MANIFEST and ignore them. + number_of_edits_to_skip_ = 0; + for (auto* cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + // Increase number_of_edits_to_skip by 2 because + // WriteCurrentStatetoManifest() writes 2 version edits for each + // column family at the beginning of the newly-generated MANIFEST. + // TODO(yanqin) remove hard-coded value. + if (db_options_->write_dbid_to_manifest) { + number_of_edits_to_skip_ += 3; + } else { + number_of_edits_to_skip_ += 2; + } + } + } } } @@ -5437,52 +5764,112 @@ Status ReactiveVersionSet::ReadAndApply( } } } + TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", + &applied_edits); return s; } Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family) { - ColumnFamilyData* cfd = nullptr; - Status status; + VersionEdit& edit, std::unordered_set* cfds_changed, + VersionEdit* version_edit) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + + // If we cannot find this column family in our column family set, then it + // may be a new column family created by the primary after the secondary + // starts. It is also possible that the secondary instance opens only a subset + // of column families. Ignore it for now. + if (nullptr == cfd) { + return Status::OK(); + } + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end() && + !cfd->IsDropped()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + if (edit.is_column_family_add_) { // TODO (yanqin) for now the secondary ignores column families created // after Open. This also simplifies handling of switching to a new MANIFEST // and processing the snapshot of the system at the beginning of the // MANIFEST. - return Status::OK(); } else if (edit.is_column_family_drop_) { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Drop a CF created by primary after secondary starts? 
Then ignore - if (cfd == nullptr) { - return Status::OK(); - } // Drop the column family by setting it to be 'dropped' without destroying // the column family handle. + // TODO (haoyu) figure out how to handle column faimly drop for + // secondary instance. (Is it possible that the ref count for cfd is 0 but + // the ref count for its versions is higher than 0?) cfd->SetDropped(); if (cfd->Unref()) { delete cfd; cfd = nullptr; } + active_version_builders_.erase(builder_iter); } else { - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // Operation on a CF created after Open? Then ignore - if (cfd == nullptr) { - return Status::OK(); + Status s = builder->Apply(&edit); + if (!s.ok()) { + return s; } - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - builder->Apply(&edit); } - return ExtractInfoFromVersionEdit( - cfd, edit, have_log_number, log_number, have_prev_log_number, - previous_log_number, have_next_file, next_file, have_last_sequence, - last_sequence, min_log_number_to_keep, max_column_family); + Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit); + if (!s.ok()) { + return s; + } + + if (cfd != nullptr && !cfd->IsDropped()) { + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + TEST_SYNC_POINT_CALLBACK( + "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" + "AfterLoadTableHandlers", + &s); + + if (s.ok()) { + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + } else if (s.IsPathNotFound()) { + s = Status::OK(); + } + // Some other error has occurred during LoadTableHandlers. 
+ } + + if (version_edit->has_next_file_number()) { + next_file_number_.store(version_edit->next_file_number_ + 1); + } + if (version_edit->has_last_sequence_) { + last_allocated_sequence_ = version_edit->last_sequence_; + last_published_sequence_ = version_edit->last_sequence_; + last_sequence_ = version_edit->last_sequence_; + } + if (version_edit->has_prev_log_number_) { + prev_log_number_ = version_edit->prev_log_number_; + MarkFileNumberUsed(version_edit->prev_log_number_); + } + if (version_edit->has_log_number_) { + MarkFileNumberUsed(version_edit->log_number_); + } + column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_); + MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_); + return s; } Status ReactiveVersionSet::MaybeSwitchManifest( @@ -5492,7 +5879,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( Status s; do { std::string manifest_path; - s = GetCurrentManifestPath(&manifest_path); + s = GetCurrentManifestPath(dbname_, env_, &manifest_path, + &manifest_file_number_); std::unique_ptr manifest_file; if (s.ok()) { if (nullptr == manifest_reader->get() || @@ -5514,7 +5902,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( std::unique_ptr manifest_file_reader; if (s.ok()) { manifest_file_reader.reset( - new SequentialFileReader(std::move(manifest_file), manifest_path)); + new SequentialFileReader(std::move(manifest_file), manifest_path, + db_options_->log_readahead_size)); manifest_reader->reset(new log::FragmentBufferedReader( nullptr, std::move(manifest_file_reader), reporter, true /* checksum */, 0 /* log_number */)); @@ -5523,8 +5912,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest( // TODO (yanqin) every time we switch to a new MANIFEST, we clear the // active_version_builders_ map because we choose to construct the // versions from scratch, thanks to the first part of each MANIFEST - // written by VersionSet::WriteSnapshot. This is not necessary, but we - // choose this at present for the sake of simplicity. + // written by VersionSet::WriteCurrentStatetoManifest. This is not + // necessary, but we choose this at present for the sake of simplicity. active_version_builders_.clear(); } } while (s.IsPathNotFound()); diff --git a/db/version_set.h b/db/version_set.h index d82c5b47291..758bd5e5d32 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -29,8 +29,8 @@ #include #include "db/column_family.h" -#include "db/compaction.h" -#include "db/compaction_picker.h" +#include "db/compaction/compaction.h" +#include "db/compaction/compaction_picker.h" #include "db/dbformat.h" #include "db/file_indexer.h" #include "db/log_reader.h" @@ -46,6 +46,7 @@ #include "rocksdb/env.h" #include "table/get_context.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -62,9 +63,16 @@ class VersionSet; class WriteBufferManager; class MergeContext; class ColumnFamilySet; -class TableCache; class MergeIteratorBuilder; +// VersionEdit is always supposed to be valid and it is used to point at +// entries in Manifest. Ideally it should not be used as a container to +// carry around few of its fields as function params because it can cause +// readers to think it's a valid entry from Manifest. To avoid that confusion +// introducing VersionEditParams to simply carry around multiple VersionEdit +// params. It need not point to a valid record in Manifest. +using VersionEditParams = VersionEdit; + // Return the smallest index i such that file_level.files[i]->largest >= key. 
// Return file_level.num_files if there is no such file. // REQUIRES: "file_level.files" contains a sorted list of @@ -91,6 +99,9 @@ extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena); +// Information of the storage associated with each Version, including number of +// levels of LSM tree, files information at each level, files marked for +// compaction, etc. class VersionStorageInfo { public: VersionStorageInfo(const InternalKeyComparator* internal_comparator, @@ -98,6 +109,9 @@ class VersionStorageInfo { CompactionStyle compaction_style, VersionStorageInfo* src_vstorage, bool _force_consistency_checks); + // No copying allowed + VersionStorageInfo(const VersionStorageInfo&) = delete; + void operator=(const VersionStorageInfo&) = delete; ~VersionStorageInfo(); void Reserve(int level, size_t size) { files_[level].reserve(size); } @@ -298,6 +312,10 @@ class VersionStorageInfo { return files_marked_for_periodic_compaction_; } + void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) { + files_marked_for_periodic_compaction_.emplace_back(level, f); + } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) // REQUIRES: DB mutex held during access const autovector>& @@ -531,12 +549,11 @@ class VersionStorageInfo { friend class Version; friend class VersionSet; - // No copying allowed - VersionStorageInfo(const VersionStorageInfo&) = delete; - void operator=(const VersionStorageInfo&) = delete; }; using MultiGetRange = MultiGetContext::Range; +// A column family's version consists of the SST files owned by the column +// family at a certain point in time. class Version { public: // Append to *iters a sequence of iterators that will @@ -555,28 +572,33 @@ class Version { const Slice& largest_user_key, int level, bool* overlap); - // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. - // Uses *operands to store merge_operator operations to apply later. + // Lookup the value for key or get all merge operands for key. + // If do_merge = true (default) then lookup value for key. + // Behavior if do_merge = true: + // If found, store it in *value and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later. // - // If the ReadOptions.read_tier is set to do a read-only fetch, then - // *value_found will be set to false if it cannot be determined whether - // this value exists without doing IO. + // If the ReadOptions.read_tier is set to do a read-only fetch, then + // *value_found will be set to false if it cannot be determined whether + // this value exists without doing IO. // - // If the key is Deleted, *status will be set to NotFound and + // If the key is Deleted, *status will be set to NotFound and // *key_exists will be set to true. - // If no key was found, *status will be set to NotFound and + // If no key was found, *status will be set to NotFound and // *key_exists will be set to false. - // If seq is non-null, *seq will be set to the sequence number found - // for the key if a key was found. - // + // If seq is non-null, *seq will be set to the sequence number found + // for the key if a key was found. 
+ // Behavior if do_merge = false + // If the key has any merge operands then store them in + // merge_context.operands_list and don't merge the operands // REQUIRES: lock is not held void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, Status* status, MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, bool* value_found = nullptr, bool* key_exists = nullptr, SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, - bool* is_blob = nullptr); + bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, ReadCallback* callback = nullptr, bool* is_blob = nullptr); @@ -620,6 +642,11 @@ class Version { Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, TablePropertiesCollection* props) const; + // Print summary of range delete tombstones in SST files into out_str, + // with maximum max_entries_to_print entries printed out. + Status TablesRangeTombstoneSummary(int max_entries_to_print, + std::string* out_str); + // REQUIRES: lock is held // On success, "tp" will contains the aggregated table property among // the table properties of all sst files in this version. @@ -649,7 +676,11 @@ class Version { uint64_t GetSstFilesSize(); - MutableCFOptions GetMutableCFOptions() { return mutable_cf_options_; } + // Retrieves the file_creation_time of the oldest file in the DB. + // Prerequisite for this API is max_open_files = -1 + void GetCreationTimeOfOldestFile(uint64_t* creation_time); + + const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; } private: Env* env_; @@ -711,8 +742,8 @@ class Version { ~Version(); // No copying allowed - Version(const Version&); - void operator=(const Version&); + Version(const Version&) = delete; + void operator=(const Version&) = delete; }; struct ObsoleteFileInfo { @@ -747,12 +778,37 @@ struct ObsoleteFileInfo { class BaseReferencedVersionBuilder; +class AtomicGroupReadBuffer { + public: + Status AddEdit(VersionEdit* edit); + void Clear(); + bool IsFull() const; + bool IsEmpty() const; + + uint64_t TEST_read_edits_in_atomic_group() const { + return read_edits_in_atomic_group_; + } + std::vector& replay_buffer() { return replay_buffer_; } + + private: + uint64_t read_edits_in_atomic_group_ = 0; + std::vector replay_buffer_; +}; + +// VersionSet is the collection of versions of all the column families of the +// database. Each database owns one VersionSet. A VersionSet has access to all +// column families via ColumnFamilySet, i.e. set of the column families. class VersionSet { public: VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, - WriteController* write_controller); + WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer); + // No copying allowed + VersionSet(const VersionSet&) = delete; + void operator=(const VersionSet&) = delete; + virtual ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -807,13 +863,15 @@ class VersionSet { bool new_descriptor_log = false, const ColumnFamilyOptions* new_cf_options = nullptr); - Status GetCurrentManifestPath(std::string* manifest_filename); + static Status GetCurrentManifestPath(const std::string& dbname, Env* env, + std::string* manifest_filename, + uint64_t* manifest_file_number); // Recover the last saved descriptor from persistent storage. 
// If read_only == true, Recover() will not complain if some column families // are not opened Status Recover(const std::vector& column_families, - bool read_only = false); + bool read_only = false, std::string* db_id = nullptr); // Reads a manifest file and returns a list of column families in // column_families. @@ -952,10 +1010,12 @@ class VersionSet { void AddLiveFiles(std::vector* live_list); // Return the approximate size of data to be scanned for range [start, end) - // in levels [start_level, end_level). If end_level == 0 it will search + // in levels [start_level, end_level). If end_level == -1 it will search // through all non-empty levels - uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end, - int start_level = 0, int end_level = -1); + uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + const Slice& start, const Slice& end, + int start_level, int end_level, + TableReaderCaller caller); // Return the size of the current manifest file uint64_t manifest_file_size() const { return manifest_file_size_; } @@ -1003,21 +1063,33 @@ class VersionSet { } }; - // ApproximateSize helper - uint64_t ApproximateSizeLevel0(Version* v, const LevelFilesBrief& files_brief, - const Slice& start, const Slice& end); + // Returns approximated offset of a key in a file for a given version. + uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, + const Slice& key, TableReaderCaller caller); + // Returns approximated data size between start and end keys in a file + // for a given version. uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& key); + const Slice& start, const Slice& end, + TableReaderCaller caller); // Save current contents to *log - Status WriteSnapshot(log::Writer* log); + Status WriteCurrentStateToManifest(log::Writer* log); void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); + Status ReadAndRecover( + log::Reader* reader, AtomicGroupReadBuffer* read_buffer, + const std::unordered_map& + name_to_options, + std::unordered_map& column_families_not_found, + std::unordered_map< + uint32_t, std::unique_ptr>& builders, + VersionEditParams* version_edit, std::string* db_id = nullptr); + // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, @@ -1025,22 +1097,17 @@ class VersionSet { std::unordered_map& column_families_not_found, std::unordered_map< uint32_t, std::unique_ptr>& builders, - bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family); - - Status ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number, - uint64_t* log_number, bool* have_prev_log_number, - uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, - bool* have_last_sequence, SequenceNumber* last_sequence, - uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + VersionEditParams* version_edit); + + Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, + const VersionEdit& from_edit, + VersionEditParams* version_edit_params); std::unique_ptr column_family_set_; Env* const env_; const std::string dbname_; + std::string db_id_; const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; // Any log 
number equal or lower than this should be ignored during recovery, @@ -1085,11 +1152,9 @@ class VersionSet { // env options for all reads and writes except compactions EnvOptions env_options_; - private: - // No copying allowed - VersionSet(const VersionSet&); - void operator=(const VersionSet&); + BlockCacheTracer* const block_cache_tracer_; + private: // REQUIRES db mutex at beginning. may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, InstrumentedMutex* mu, Directory* db_directory, @@ -1097,10 +1162,14 @@ class VersionSet { const ColumnFamilyOptions* new_cf_options); void LogAndApplyCFHelper(VersionEdit* edit); - void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, - VersionEdit* edit, InstrumentedMutex* mu); + Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, + VersionEdit* edit, InstrumentedMutex* mu); }; +// ReactiveVersionSet represents a collection of versions of the column +// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary, +// need to replay the MANIFEST (description log in older terms) in order to +// reconstruct and install versions. class ReactiveVersionSet : public VersionSet { public: ReactiveVersionSet(const std::string& dbname, @@ -1121,16 +1190,20 @@ class ReactiveVersionSet : public VersionSet { std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); + uint64_t TEST_read_edits_in_atomic_group() const { + return read_buffer_.TEST_read_edits_in_atomic_group(); + } + std::vector& replay_buffer() { + return read_buffer_.replay_buffer(); + } + protected: using VersionSet::ApplyOneVersionEditToBuilder; // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, bool* have_log_number, uint64_t* log_number, - bool* have_prev_log_number, uint64_t* previous_log_number, - bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, - SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, - uint32_t* max_column_family); + VersionEdit& edit, std::unordered_set* cfds_changed, + VersionEdit* version_edit); Status MaybeSwitchManifest( log::Reader::Reporter* reporter, @@ -1139,6 +1212,10 @@ class ReactiveVersionSet : public VersionSet { private: std::unordered_map> active_version_builders_; + AtomicGroupReadBuffer read_buffer_; + // Number of version edits to skip by ReadAndApply at the beginning of a new + // MANIFEST created by primary. + int number_of_edits_to_skip_; using VersionSet::LogAndApply; using VersionSet::Recover; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 43924a3addd..66ad930f583 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -8,12 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_set.h" +#include "db/db_impl/db_impl.h" #include "db/log_writer.h" +#include "logging/logging.h" #include "table/mock_table.h" -#include "util/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -34,10 +35,12 @@ class GenerateLevelFilesBriefTest : public testing::Test { void Add(const char* smallest, const char* largest, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(files_.size() + 1, 0, 0); - f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); - f->largest = InternalKey(largest, largest_seq, kTypeValue); + FileMetaData* f = new FileMetaData( + files_.size() + 1, 0, 0, + InternalKey(smallest, smallest_seq, kTypeValue), + InternalKey(largest, largest_seq, kTypeValue), smallest_seq, + largest_seq, /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); files_.push_back(f); } @@ -128,28 +131,24 @@ class VersionStorageInfoTest : public testing::Test { void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0) { assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, 0, file_size); - f->smallest = GetInternalKey(smallest, 0); - f->largest = GetInternalKey(largest, 0); + FileMetaData* f = new FileMetaData( + file_number, 0, file_size, GetInternalKey(smallest, 0), + GetInternalKey(largest, 0), /* smallest_seq */ 0, /* largest_seq */ 0, + /* marked_for_compact */ false, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime); f->compensated_file_size = file_size; - f->refs = 0; - f->num_entries = 0; - f->num_deletions = 0; vstorage_.AddFile(level, f); } void Add(int level, uint32_t file_number, const InternalKey& smallest, const InternalKey& largest, uint64_t file_size = 0) { assert(level < vstorage_.num_levels()); - FileMetaData* f = new FileMetaData; - f->fd = FileDescriptor(file_number, 0, file_size); - f->smallest = smallest; - f->largest = largest; + FileMetaData* f = new FileMetaData( + file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, + /* largest_seq */ 0, /* marked_for_compact */ false, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime); f->compensated_file_size = file_size; - f->refs = 0; - f->num_entries = 0; - f->num_deletions = 0; vstorage_.AddFile(level, f); } @@ -607,6 +606,7 @@ class VersionSetTestBase { const static std::string kColumnFamilyName1; const static std::string kColumnFamilyName2; const static std::string kColumnFamilyName3; + int num_initial_edits_; VersionSetTestBase() : env_(Env::Default()), @@ -617,7 +617,11 @@ class VersionSetTestBase { write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)), + &write_controller_, + /*block_cache_tracer=*/nullptr)), + reactive_versions_(std::make_shared( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_)), shutting_down_(false), mock_table_factory_(std::make_shared()) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); @@ -632,6 +636,12 @@ class VersionSetTestBase { assert(last_seqno != nullptr); assert(log_writer != nullptr); VersionEdit new_db; + if 
(db_options_.write_dbid_to_manifest) { + DBImpl* impl = new DBImpl(DBOptions(), dbname_); + std::string db_id; + impl->GetDbIdentityFromIdentityFile(&db_id); + new_db.SetDBId(db_id); + } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -653,7 +663,7 @@ class VersionSetTestBase { new_cfs.emplace_back(new_cf); } *last_seqno = last_seq; - + num_initial_edits_ = static_cast(new_cfs.size() + 1); const std::string manifest = DescriptorFileName(dbname_, 1); std::unique_ptr file; Status s = env_->NewWritableFile( @@ -686,7 +696,7 @@ class VersionSetTestBase { std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; - + SetIdentityFile(env_, dbname_); PrepareManifest(&column_families, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. @@ -708,6 +718,7 @@ class VersionSetTestBase { WriteController write_controller_; WriteBufferManager write_buffer_manager_; std::shared_ptr versions_; + std::shared_ptr reactive_versions_; InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; @@ -746,7 +757,7 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) { uint32_t* cf_id = reinterpret_cast(arg); - EXPECT_EQ(0, *cf_id); + EXPECT_EQ(0u, *cf_id); ++count; }); SyncPoint::GetInstance()->EnableProcessing(); @@ -758,216 +769,388 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { EXPECT_EQ(kGroupSize - 1, count); } -TEST_F(VersionSetTest, HandleValidAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +class VersionSetAtomicGroupTest : public VersionSetTestBase, + public testing::Test { + public: + VersionSetAtomicGroupTest() : VersionSetTestBase() {} - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 3; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); + void SetUp() override { + PrepareManifest(&column_families_, &last_seqno_, &log_writer_); + SetupTestSyncPoints(); } - log_writer.reset(); - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); - - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); + void SetupValidAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; + void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + edits_[i].MarkAtomicGroup(--remaining); + edits_[i].SetLastSequence(last_seqno_++); + } + 
ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.back().DebugString(), - e->DebugString()); // compare based on value - EXPECT_TRUE(first_in_atomic_group); - last_in_atomic_group = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); + void SetupCorruptedAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != ((size_t)atomic_group_size / 2)) { + edits_[i].MarkAtomicGroup(--remaining); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_TRUE(last_in_atomic_group); -} + void SetupIncorrectAtomicGroup(int atomic_group_size) { + edits_.resize(atomic_group_size); + int remaining = atomic_group_size; + for (size_t i = 0; i != edits_.size(); ++i) { + edits_[i].SetLogNumber(0); + edits_[i].SetNextFile(2); + if (i != 1) { + edits_[i].MarkAtomicGroup(--remaining); + } else { + edits_[i].MarkAtomicGroup(remaining--); + } + edits_[i].SetLastSequence(last_seqno_++); + } + ASSERT_OK(SetCurrentFile(env_, dbname_, 1, nullptr)); + } -TEST_F(VersionSetTest, HandleIncompleteTrailingAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); + void SetupTestSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.front().DebugString(), + e->DebugString()); // compare based on value + first_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits_.back().DebugString(), + e->DebugString()); // compare based on value + EXPECT_TRUE(first_in_atomic_group_); + last_in_atomic_group_ = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { + num_recovered_edits_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ReadAndApply:AppliedEdits", + [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroup", + [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", + [&](void* arg) { + corrupted_edit_ = *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->SetCallBack( + "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", + [&](void* arg) { + 
edit_with_incorrect_group_size_ = + *reinterpret_cast(arg); + }); + SyncPoint::GetInstance()->EnableProcessing(); + } - // Append multiple version edits that form an atomic group - const int kAtomicGroupSize = 4; - const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; - std::vector edits(kNumberOfPersistedVersionEdits); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - edits[i].MarkAtomicGroup(--remaining); - edits[i].SetLastSequence(last_seqno++); + void AddNewEditsToLog(int num_edits) { + for (int i = 0; i < num_edits; i++) { + std::string record; + edits_[i].EncodeTo(&record); + ASSERT_OK(log_writer_->AddRecord(record)); + } } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); + + void TearDown() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + log_writer_.reset(); } - log_writer.reset(); - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + protected: + std::vector column_families_; + SequenceNumber last_seqno_; + std::vector edits_; + bool first_in_atomic_group_ = false; + bool last_in_atomic_group_ = false; + int num_edits_in_atomic_group_ = 0; + int num_recovered_edits_ = 0; + int num_applied_edits_ = 0; + VersionEdit corrupted_edit_; + VersionEdit edit_with_incorrect_group_size_; + std::unique_ptr log_writer_; +}; - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool first_in_atomic_group = false; - bool last_in_atomic_group = false; - size_t num = 0; +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits.front().DebugString(), - e->DebugString()); // compare based on value - first_in_atomic_group = true; - }); - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:LastInAtomicGroup", - [&](void* /* arg */) { last_in_atomic_group = true; }); - SyncPoint::GetInstance()->SetCallBack("VersionSet::Recover:AtomicGroup", - [&](void* /* arg */) { ++num; }); - SyncPoint::GetInstance()->EnableProcessing(); +TEST_F(VersionSetAtomicGroupTest, + HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 3; + SetupValidAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_TRUE(last_in_atomic_group_); + // The recover should clean up the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(first_in_atomic_group); - EXPECT_FALSE(last_in_atomic_group); - EXPECT_EQ(kNumberOfPersistedVersionEdits, num); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); } -TEST_F(VersionSetTest, HandleCorruptedAtomicGroup) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); - - // Append multiple version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != (kAtomicGroupSize / 2)) { - edits[i].MarkAtomicGroup(--remaining); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - 
edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); - - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + // Write the last record. The reactive version set should now apply all + // edits. + std::string last_record; + edits_[kAtomicGroupSize - 1].EncodeTo(&last_record); + EXPECT_OK(log_writer_->AddRecord(last_record)); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + // Reactive version set should be empty now. + EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + // No edits in an atomic group. + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + // Write a few edits in an atomic group. + AddNewEditsToLog(kNumberOfPersistedVersionEdits); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_TRUE(first_in_atomic_group_); + EXPECT_FALSE(last_in_atomic_group_); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); + // Reactive version set should store the edits in the replay buffer. 
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == + kNumberOfPersistedVersionEdits); + EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); + EXPECT_EQ(0, num_applied_edits_); +} - bool mixed = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[kAtomicGroupSize / 2].DebugString(), e->DebugString()); - mixed = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(mixed); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); } -TEST_F(VersionSetTest, HandleIncorrectAtomicGroupSize) { - std::vector column_families; - SequenceNumber last_seqno; - std::unique_ptr log_writer; - PrepareManifest(&column_families, &last_seqno, &log_writer); +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupCorruptedAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - // Append multiple version edits that form an atomic group +TEST_F(VersionSetAtomicGroupTest, + HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) { const int kAtomicGroupSize = 4; - std::vector edits(kAtomicGroupSize); - int remaining = kAtomicGroupSize; - for (size_t i = 0; i != edits.size(); ++i) { - edits[i].SetLogNumber(0); - edits[i].SetNextFile(2); - if (i != 1) { - edits[i].MarkAtomicGroup(--remaining); - } else { - edits[i].MarkAtomicGroup(remaining--); - } - edits[i].SetLastSequence(last_seqno++); - } - Status s; - for (const auto& edit : edits) { - std::string record; - edit.EncodeTo(&record); - s = log_writer->AddRecord(record); - ASSERT_OK(s); - } - log_writer.reset(); + SetupCorruptedAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + // Write the corrupted edits. 
+ AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), + corrupted_edit_.DebugString()); +} - s = SetCurrentFile(env_, dbname_, 1, nullptr); - ASSERT_OK(s); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + EXPECT_NOK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + AddNewEditsToLog(kAtomicGroupSize); + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + EXPECT_EQ(column_families_.size(), + reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); +} - bool incorrect_group_size = false; - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::Recover:IncorrectAtomicGroupSize", [&](void* arg) { - VersionEdit* e = reinterpret_cast(arg); - EXPECT_EQ(edits[1].DebugString(), e->DebugString()); - incorrect_group_size = true; - }); - SyncPoint::GetInstance()->EnableProcessing(); - EXPECT_NOK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), - versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); - EXPECT_TRUE(incorrect_group_size); +TEST_F(VersionSetAtomicGroupTest, + HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) { + const int kAtomicGroupSize = 4; + SetupIncorrectAtomicGroup(kAtomicGroupSize); + InstrumentedMutex mu; + std::unordered_set cfds_changed; + std::unique_ptr manifest_reader; + std::unique_ptr manifest_reporter; + std::unique_ptr manifest_reader_status; + EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, + &manifest_reporter, + &manifest_reader_status)); + AddNewEditsToLog(kAtomicGroupSize); + mu.Lock(); + EXPECT_OK( + reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + mu.Unlock(); + EXPECT_EQ(edits_[1].DebugString(), + edit_with_incorrect_group_size_.DebugString()); } class VersionSetTestDropOneCF : public VersionSetTestBase, @@ -1088,7 +1271,6 @@ INSTANTIATE_TEST_CASE_P( testing::Values(VersionSetTestBase::kColumnFamilyName1, VersionSetTestBase::kColumnFamilyName2, VersionSetTestBase::kColumnFamilyName3)); - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 62511819e4d..d3f59a10e3f 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -9,32 +9,28 @@ #include "db/wal_manager.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include -#include +#include #include +#include #include "db/log_reader.h" #include "db/log_writer.h" #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" +#include "file/file_util.h" 
+#include "file/filename.h" +#include "file/sequence_file_reader.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/write_batch.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/file_util.h" -#include "util/filename.h" -#include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -191,7 +187,8 @@ void WalManager::PurgeObsoleteWALFiles() { continue; } if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -217,7 +214,8 @@ void WalManager::PurgeObsoleteWALFiles() { log_file_size = std::max(log_file_size, file_size); ++log_files_num; } else { - s = DeleteDBFile(&db_options_, file_path, archival_dir, false); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), @@ -257,7 +255,8 @@ void WalManager::PurgeObsoleteWALFiles() { for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, - db_options_.wal_dir, false); + db_options_.wal_dir, false, + /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -281,17 +280,6 @@ void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { s.ToString().c_str()); } -namespace { -struct CompareLogByPointer { - bool operator()(const std::unique_ptr& a, - const std::unique_ptr& b) { - LogFileImpl* a_impl = static_cast_with_check(a.get()); - LogFileImpl* b_impl = static_cast_with_check(b.get()); - return *a_impl < *b_impl; - } -}; -} - Status WalManager::GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files, WalFileType log_type) { @@ -324,14 +312,15 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, uint64_t size_bytes; s = env_->GetFileSize(LogFileName(path, number), &size_bytes); // re-try in case the alive log file has been moved to archive. - std::string archived_file = ArchivedLogFileName(path, number); - if (!s.ok() && log_type == kAliveLogFile && - env_->FileExists(archived_file).ok()) { - s = env_->GetFileSize(archived_file, &size_bytes); - if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { - // oops, the file just got deleted from archived dir! move on - s = Status::OK(); - continue; + if (!s.ok() && log_type == kAliveLogFile) { + std::string archived_file = ArchivedLogFileName(path, number); + if (env_->FileExists(archived_file).ok()) { + s = env_->GetFileSize(archived_file, &size_bytes); + if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) { + // oops, the file just got deleted from archived dir! 
move on + s = Status::OK(); + continue; + } } } if (!s.ok()) { @@ -342,8 +331,15 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, new LogFileImpl(number, log_type, sequence, size_bytes))); } } - CompareLogByPointer compare_log_files; - std::sort(log_files.begin(), log_files.end(), compare_log_files); + std::sort( + log_files.begin(), log_files.end(), + [](const std::unique_ptr& a, const std::unique_ptr& b) { + LogFileImpl* a_impl = + static_cast_with_check(a.get()); + LogFileImpl* b_impl = + static_cast_with_check(b.get()); + return *a_impl < *b_impl; + }); return status; } @@ -393,7 +389,7 @@ Status WalManager::ReadFirstRecord(const WalFileType type, if (type == kAliveLogFile) { std::string fname = LogFileName(db_options_.wal_dir, number); s = ReadFirstLine(fname, number, sequence); - if (env_->FileExists(fname).ok() && !s.ok()) { + if (!s.ok() && env_->FileExists(fname).ok()) { // return any error that is not caused by non-existing file return s; } @@ -419,6 +415,32 @@ Status WalManager::ReadFirstRecord(const WalFileType type, return s; } +Status WalManager::GetLiveWalFile(uint64_t number, + std::unique_ptr* log_file) { + if (!log_file) { + return Status::InvalidArgument("log_file not preallocated."); + } + + if (!number) { + return Status::PathNotFound("log file not available"); + } + + Status s; + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(db_options_.wal_dir, number), &size_bytes); + + if (!s.ok()) { + return s; + } + + log_file->reset(new LogFileImpl(number, kAliveLogFile, + 0, // SequenceNumber + size_bytes)); + + return Status::OK(); +} + // the function returns status.ok() and sequence == 0 if the file exists, but is // empty Status WalManager::ReadFirstLine(const std::string& fname, diff --git a/db/wal_manager.h b/db/wal_manager.h index 6caf1640c06..97211f0003a 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -18,6 +18,7 @@ #include #include "db/version_set.h" +#include "file/file_util.h" #include "options/db_options.h" #include "port/port.h" #include "rocksdb/env.h" @@ -28,6 +29,10 @@ namespace rocksdb { #ifndef ROCKSDB_LITE + +// WAL manager provides the abstraction for reading the WAL files as a single +// unit. Internally, it opens and reads the files using Reader or Writer +// abstraction. class WalManager { public: WalManager(const ImmutableDBOptions& db_options, @@ -36,10 +41,13 @@ class WalManager { env_options_(env_options), env_(db_options.env), purge_wal_files_last_run_(0), - seq_per_batch_(seq_per_batch) {} + seq_per_batch_(seq_per_batch), + wal_in_db_path_(IsWalDirSameAsDBPath(&db_options)) {} Status GetSortedWalFiles(VectorLogPtr& files); + // Allow user to tail transaction log to find all recent changes to the + // database that are newer than `seq_number`. Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options, @@ -51,6 +59,8 @@ class WalManager { Status DeleteFile(const std::string& fname, uint64_t number); + Status GetLiveWalFile(uint64_t number, std::unique_ptr* log_file); + Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, SequenceNumber* sequence) { return ReadFirstRecord(type, number, sequence); @@ -91,6 +101,8 @@ class WalManager { bool seq_per_batch_; + bool wal_in_db_path_; + // obsolete files will be deleted every this seconds if ttl deletion is // enabled and archive size_limit is disabled. 
static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 379f12f52aa..4f15a064d1f 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -13,16 +13,16 @@ #include "rocksdb/write_buffer_manager.h" #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" #include "db/wal_manager.h" #include "env/mock_env.h" +#include "file/writable_file_writer.h" #include "table/mock_table.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -50,7 +50,8 @@ class WalManagerTest : public testing::Test { versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, - &write_controller_)); + &write_controller_, + /*block_cache_tracer=*/nullptr)); wal_manager_.reset(new WalManager(db_options_, env_options_)); } @@ -293,6 +294,29 @@ TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) { ASSERT_TRUE(!iter->Valid()); } +TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) { + Init(); + CreateArchiveLogs(2, 100); + auto iter = OpenTransactionLogIter(0); + CreateArchiveLogs(1, 100); + int i = 0; + for (; iter->Valid(); iter->Next()) { + i++; + } + ASSERT_EQ(i, 200); + // A new log file was added after the iterator was created. + // TryAgain indicates a new iterator is needed to fetch the new data + ASSERT_TRUE(iter->status().IsTryAgain()); + + iter = OpenTransactionLogIter(0); + i = 0; + for (; iter->Valid(); iter->Next()) { + i++; + } + ASSERT_EQ(i, 300); + ASSERT_TRUE(iter->status().ok()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/write_batch.cc b/db/write_batch.cc index 939b595305b..350c1a1c072 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -39,20 +39,23 @@ #include #include #include +#include #include #include "db/column_family.h" -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" +#include "db/trim_history_scheduler.h" #include "db/write_batch_internal.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" #include "util/autovector.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/duplicate_detector.h" #include "util/string_util.h" @@ -135,6 +138,105 @@ struct BatchContentClassifier : public WriteBatch::Handler { } }; +class TimestampAssigner : public WriteBatch::Handler { + public: + explicit TimestampAssigner(const Slice& ts) + : timestamp_(ts), timestamps_(kEmptyTimestampList) {} + explicit TimestampAssigner(const std::vector& ts_list) + : timestamps_(ts_list) { + SanityCheck(); + } + ~TimestampAssigner() override {} + + Status PutCF(uint32_t, const Slice& key, const Slice&) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status DeleteCF(uint32_t, const Slice& key) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status SingleDeleteCF(uint32_t, const Slice& key) override { + AssignTimestamp(key); + ++idx_; + return Status::OK(); + } + + Status DeleteRangeCF(uint32_t, const Slice& begin_key, + const Slice& end_key) override { + AssignTimestamp(begin_key); + 
AssignTimestamp(end_key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status MergeCF(uint32_t, const Slice& key, const Slice&) override {
+    AssignTimestamp(key);
+    ++idx_;
+    return Status::OK();
+  }
+
+  Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+    // TODO (yanqin): support blob db in the future.
+    return Status::OK();
+  }
+
+  Status MarkBeginPrepare(bool) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkEndPrepare(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkCommit(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+  Status MarkRollback(const Slice&) override {
+    // TODO (yanqin): support in the future.
+    return Status::OK();
+  }
+
+ private:
+  void SanityCheck() const {
+    assert(!timestamps_.empty());
+#ifndef NDEBUG
+    const size_t ts_sz = timestamps_[0].size();
+    for (size_t i = 1; i != timestamps_.size(); ++i) {
+      assert(ts_sz == timestamps_[i].size());
+    }
+#endif  // !NDEBUG
+  }
+
+  void AssignTimestamp(const Slice& key) {
+    assert(timestamps_.empty() || idx_ < timestamps_.size());
+    const Slice& ts = timestamps_.empty() ? timestamp_ : timestamps_[idx_];
+    size_t ts_sz = ts.size();
+    char* ptr = const_cast<char*>(key.data() + key.size() - ts_sz);
+    memcpy(ptr, ts.data(), ts_sz);
+  }
+
+  static const std::vector<Slice> kEmptyTimestampList;
+  const Slice timestamp_;
+  const std::vector<Slice>& timestamps_;
+  size_t idx_ = 0;
+
+  // No copy or move.
+  TimestampAssigner(const TimestampAssigner&) = delete;
+  TimestampAssigner(TimestampAssigner&&) = delete;
+  TimestampAssigner& operator=(const TimestampAssigner&) = delete;
+  TimestampAssigner&& operator=(TimestampAssigner&&) = delete;
+};
+const std::vector<Slice> TimestampAssigner::kEmptyTimestampList;
+
 }  // anon namespace
 
 struct SavePoints {
@@ -142,7 +244,15 @@
 };
 
 WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes)
-    : content_flags_(0), max_bytes_(max_bytes), rep_() {
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(0) {
+  rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+                   ? reserved_bytes
+                   : WriteBatchInternal::kHeader);
+  rep_.resize(WriteBatchInternal::kHeader);
+}
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz)
+    : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) {
   rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) ?
reserved_bytes : WriteBatchInternal::kHeader); rep_.resize(WriteBatchInternal::kHeader); @@ -151,18 +261,21 @@ WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes) WriteBatch::WriteBatch(const std::string& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), - rep_(rep) {} + rep_(rep), + timestamp_size_(0) {} WriteBatch::WriteBatch(std::string&& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), - rep_(std::move(rep)) {} + rep_(std::move(rep)), + timestamp_size_(0) {} WriteBatch::WriteBatch(const WriteBatch& src) : wal_term_point_(src.wal_term_point_), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(src.rep_) { + rep_(src.rep_), + timestamp_size_(src.timestamp_size_) { if (src.save_points_ != nullptr) { save_points_.reset(new SavePoints()); save_points_->stack = src.save_points_->stack; @@ -174,7 +287,8 @@ WriteBatch::WriteBatch(WriteBatch&& src) noexcept wal_term_point_(std::move(src.wal_term_point_)), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), - rep_(std::move(src.rep_)) {} + rep_(std::move(src.rep_)), + timestamp_size_(src.timestamp_size_) {} WriteBatch& WriteBatch::operator=(const WriteBatch& src) { if (&src != this) { @@ -220,9 +334,7 @@ void WriteBatch::Clear() { wal_term_point_.clear(); } -int WriteBatch::Count() const { - return WriteBatchInternal::Count(this); -} +uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); } uint32_t WriteBatch::ComputeContentFlags() const { auto rv = content_flags_.load(std::memory_order_relaxed); @@ -400,19 +512,32 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, } Status WriteBatch::Iterate(Handler* handler) const { - Slice input(rep_); - if (input.size() < WriteBatchInternal::kHeader) { + if (rep_.size() < WriteBatchInternal::kHeader) { return Status::Corruption("malformed WriteBatch (too small)"); } - input.remove_prefix(WriteBatchInternal::kHeader); + return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader, + rep_.size()); +} + +Status WriteBatchInternal::Iterate(const WriteBatch* wb, + WriteBatch::Handler* handler, size_t begin, + size_t end) { + if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) { + return Status::Corruption("Invalid start/end bounds for Iterate"); + } + assert(begin <= end); + Slice input(wb->rep_.data() + begin, static_cast(end - begin)); + bool whole_batch = + (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size()); + Slice key, value, blob, xid; // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as // the batch boundary symbols otherwise we would mis-count the number of // batches. We do that by checking whether the accumulated batch is empty // before seeing the next Noop. 
bool empty_batch = true; - int found = 0; + uint32_t found = 0; Status s; char tag = 0; uint32_t column_family = 0; // default @@ -436,7 +561,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } } else { assert(s.IsTryAgain()); - assert(!last_was_try_again); // to detect infinite loop bugs + assert(!last_was_try_again); // to detect infinite loop bugs if (UNLIKELY(last_was_try_again)) { return Status::Corruption( "two consecutive TryAgain in WriteBatch handler; this is either a " @@ -449,7 +574,7 @@ Status WriteBatch::Iterate(Handler* handler) const { switch (tag) { case kTypeColumnFamilyValue: case kTypeValue: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); s = handler->PutCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -459,7 +584,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyDeletion: case kTypeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); s = handler->DeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -469,7 +594,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilySingleDeletion: case kTypeSingleDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); s = handler->SingleDeleteCF(column_family, key); if (LIKELY(s.ok())) { @@ -479,7 +604,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyRangeDeletion: case kTypeRangeDeletion: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); s = handler->DeleteRangeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -489,7 +614,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyMerge: case kTypeMerge: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); s = handler->MergeCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -499,7 +624,7 @@ Status WriteBatch::Iterate(Handler* handler) const { break; case kTypeColumnFamilyBlobIndex: case kTypeBlobIndex: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); s = handler->PutBlobIndexCF(column_family, key, value); if (LIKELY(s.ok())) { @@ -512,7 +637,7 @@ Status WriteBatch::Iterate(Handler* handler) const { empty_batch = false; break; case kTypeBeginPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -531,7 +656,7 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeBeginPersistedPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); empty_batch = false; @@ -544,7 +669,7 @@ Status 
WriteBatch::Iterate(Handler* handler) const { } break; case kTypeBeginUnprepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); handler->MarkBeginPrepare(true /* unprepared */); empty_batch = false; @@ -563,19 +688,19 @@ Status WriteBatch::Iterate(Handler* handler) const { } break; case kTypeEndPrepareXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); handler->MarkEndPrepare(xid); empty_batch = true; break; case kTypeCommitXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); handler->MarkCommit(xid); empty_batch = true; break; case kTypeRollbackXID: - assert(content_flags_.load(std::memory_order_relaxed) & + assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); handler->MarkRollback(xid); empty_batch = true; @@ -591,7 +716,8 @@ Status WriteBatch::Iterate(Handler* handler) const { if (!s.ok()) { return s; } - if (handler_continue && found != WriteBatchInternal::Count(this)) { + if (handler_continue && whole_batch && + found != WriteBatchInternal::Count(wb)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); @@ -606,11 +732,11 @@ void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) { b->is_latest_persistent_state_ = true; } -int WriteBatchInternal::Count(const WriteBatch* b) { +uint32_t WriteBatchInternal::Count(const WriteBatch* b) { return DecodeFixed32(b->rep_.data() + 8); } -void WriteBatchInternal::SetCount(WriteBatch* b, int n) { +void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) { EncodeFixed32(&b->rep_[8], n); } @@ -643,7 +769,14 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - PutLengthPrefixedSlice(&b->rep_, key); + if (0 == b->timestamp_size_) { + PutLengthPrefixedSlice(&b->rep_, key); + } else { + PutVarint32(&b->rep_, + static_cast(key.size() + b->timestamp_size_)); + b->rep_.append(key.data(), key.size()); + b->rep_.append(b->timestamp_size_, '\0'); + } PutLengthPrefixedSlice(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, @@ -692,7 +825,11 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } - PutLengthPrefixedSliceParts(&b->rep_, key); + if (0 == b->timestamp_size_) { + PutLengthPrefixedSliceParts(&b->rep_, key); + } else { + PutLengthPrefixedSlicePartsWithPadding(&b->rep_, key, b->timestamp_size_); + } PutLengthPrefixedSliceParts(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, @@ -1011,7 +1148,7 @@ Status WriteBatch::RollbackToSavePoint() { save_points_->stack.pop(); assert(savepoint.size <= rep_.size()); - assert(savepoint.count <= Count()); + assert(static_cast(savepoint.count) <= Count()); if (savepoint.size == rep_.size()) { // No changes to rollback @@ -1038,11 +1175,22 @@ Status WriteBatch::PopSavePoint() { return Status::OK(); } +Status WriteBatch::AssignTimestamp(const 
Slice& ts) { + TimestampAssigner ts_assigner(ts); + return Iterate(&ts_assigner); +} + +Status WriteBatch::AssignTimestamps(const std::vector& ts_list) { + TimestampAssigner ts_assigner(ts_list); + return Iterate(&ts_assigner); +} + class MemTableInserter : public WriteBatch::Handler { SequenceNumber sequence_; ColumnFamilyMemTables* const cf_mems_; FlushScheduler* const flush_scheduler_; + TrimHistoryScheduler* const trim_history_scheduler_; const bool ignore_missing_column_families_; const uint64_t recovering_log_number_; // log number that all Memtables inserted into should reference @@ -1076,6 +1224,22 @@ class MemTableInserter : public WriteBatch::Handler { DupDetector duplicate_detector_; bool dup_dectector_on_; + bool hint_per_batch_; + bool hint_created_; + // Hints for this batch + using HintMap = std::unordered_map; + using HintMapType = std::aligned_storage::type; + HintMapType hint_; + + HintMap& GetHintMap() { + assert(hint_per_batch_); + if (!hint_created_) { + new (&hint_) HintMap(); + hint_created_ = true; + } + return *reinterpret_cast(&hint_); + } + MemPostInfoMap& GetPostMap() { assert(concurrent_memtable_writes_); if(!post_info_created_) { @@ -1104,18 +1268,20 @@ class MemTableInserter : public WriteBatch::Handler { // cf_mems should not be shared with concurrent inserters MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families, uint64_t recovering_log_number, DB* db, bool concurrent_memtable_writes, bool* has_valid_writes = nullptr, bool seq_per_batch = false, - bool batch_per_txn = true) + bool batch_per_txn = true, bool hint_per_batch = false) : sequence_(_sequence), cf_mems_(cf_mems), flush_scheduler_(flush_scheduler), + trim_history_scheduler_(trim_history_scheduler), ignore_missing_column_families_(ignore_missing_column_families), recovering_log_number_(recovering_log_number), log_number_ref_(0), - db_(reinterpret_cast(db)), + db_(static_cast_with_check(db)), concurrent_memtable_writes_(concurrent_memtable_writes), post_info_created_(false), has_valid_writes_(has_valid_writes), @@ -1131,7 +1297,9 @@ class MemTableInserter : public WriteBatch::Handler { write_before_prepare_(!batch_per_txn), unprepared_batch_(false), duplicate_detector_(), - dup_dectector_on_(false) { + dup_dectector_on_(false), + hint_per_batch_(hint_per_batch), + hint_created_(false) { assert(cf_mems_); } @@ -1144,6 +1312,12 @@ class MemTableInserter : public WriteBatch::Handler { reinterpret_cast (&mem_post_info_map_)->~MemPostInfoMap(); } + if (hint_created_) { + for (auto iter : GetHintMap()) { + delete[] reinterpret_cast(iter.second); + } + reinterpret_cast(&hint_)->~HintMap(); + } delete rebuilding_trx_; } @@ -1253,7 +1427,8 @@ class MemTableInserter : public WriteBatch::Handler { if (!moptions->inplace_update_support) { bool mem_res = mem->Add(sequence_, value_type, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? &GetHintMap()[mem] : nullptr); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); @@ -1336,7 +1511,8 @@ class MemTableInserter : public WriteBatch::Handler { MemTable* mem = cf_mems_->GetMemTable(); bool mem_res = mem->Add(sequence_, delete_type, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); + concurrent_memtable_writes_, get_post_process_info(mem), + hint_per_batch_ ? 
&GetHintMap()[mem] : nullptr); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); @@ -1471,7 +1647,6 @@ class MemTableInserter : public WriteBatch::Handler { Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - assert(!concurrent_memtable_writes_); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); @@ -1498,6 +1673,8 @@ class MemTableInserter : public WriteBatch::Handler { MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); bool perform_merge = false; + assert(!concurrent_memtable_writes_ || + moptions->max_successive_merges == 0); // If we pass DB through and options.max_successive_merges is hit // during recovery, Get() will be issued which will try to acquire @@ -1505,6 +1682,7 @@ class MemTableInserter : public WriteBatch::Handler { // So we disable merge in recovery if (moptions->max_successive_merges > 0 && db_ != nullptr && recovering_log_number_ == 0) { + assert(!concurrent_memtable_writes_); LookupKey lkey(key, sequence_); // Count the number of successive merges at the head @@ -1550,6 +1728,7 @@ class MemTableInserter : public WriteBatch::Handler { perform_merge = false; } else { // 3) Add value to memtable + assert(!concurrent_memtable_writes_); bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); @@ -1562,7 +1741,9 @@ class MemTableInserter : public WriteBatch::Handler { if (!perform_merge) { // Add merge operator to memtable - bool mem_res = mem->Add(sequence_, kTypeMerge, key, value); + bool mem_res = + mem->Add(sequence_, kTypeMerge, key, value, + concurrent_memtable_writes_, get_post_process_info(mem)); if (UNLIKELY(!mem_res)) { assert(seq_per_batch_); ret_status = Status::TryAgain("key+seq exists"); @@ -1597,7 +1778,20 @@ class MemTableInserter : public WriteBatch::Handler { cfd->mem()->MarkFlushScheduled()) { // MarkFlushScheduled only returns true if we are the one that // should take action, so no need to dedup further - flush_scheduler_->ScheduleFlush(cfd); + flush_scheduler_->ScheduleWork(cfd); + } + } + // check if memtable_list size exceeds max_write_buffer_size_to_maintain + if (trim_history_scheduler_ != nullptr) { + auto* cfd = cf_mems_->current(); + assert(cfd != nullptr); + if (cfd->ioptions()->max_write_buffer_size_to_maintain > 0 && + cfd->mem()->ApproximateMemoryUsageFast() + + cfd->imm()->ApproximateMemoryUsageExcludingLast() >= + static_cast( + cfd->ioptions()->max_write_buffer_size_to_maintain) && + cfd->imm()->MarkTrimHistoryNeeded()) { + trim_history_scheduler_->ScheduleWork(cfd); } } } @@ -1757,12 +1951,14 @@ class MemTableInserter : public WriteBatch::Handler { Status WriteBatchInternal::InsertInto( WriteThread::WriteGroup& write_group, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db, bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) { MemTableInserter inserter( - sequence, memtables, flush_scheduler, ignore_missing_column_families, - recovery_log_number, db, concurrent_memtable_writes, - nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); + sequence, memtables, flush_scheduler, trim_history_scheduler, + ignore_missing_column_families, 
recovery_log_number, db, + concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn); for (auto w : write_group) { if (w->CallbackFailed()) { continue; @@ -1788,17 +1984,19 @@ Status WriteBatchInternal::InsertInto( Status WriteBatchInternal::InsertInto( WriteThread::Writer* writer, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families, uint64_t log_number, DB* db, bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt, - bool batch_per_txn) { + bool batch_per_txn, bool hint_per_batch) { #ifdef NDEBUG (void)batch_cnt; #endif assert(writer->ShouldWriteToMemtable()); MemTableInserter inserter( - sequence, memtables, flush_scheduler, ignore_missing_column_families, - log_number, db, concurrent_memtable_writes, nullptr /*has_valid_writes*/, - seq_per_batch, batch_per_txn); + sequence, memtables, flush_scheduler, trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); SetSequence(writer->batch, sequence); inserter.set_log_number_ref(writer->log_ref); Status s = writer->batch->Iterate(&inserter); @@ -1812,11 +2010,13 @@ Status WriteBatchInternal::InsertInto( Status WriteBatchInternal::InsertInto( const WriteBatch* batch, ColumnFamilyMemTables* memtables, - FlushScheduler* flush_scheduler, bool ignore_missing_column_families, - uint64_t log_number, DB* db, bool concurrent_memtable_writes, - SequenceNumber* next_seq, bool* has_valid_writes, bool seq_per_batch, - bool batch_per_txn) { + FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes, SequenceNumber* next_seq, + bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) { MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, + trim_history_scheduler, ignore_missing_column_families, log_number, db, concurrent_memtable_writes, has_valid_writes, seq_per_batch, batch_per_txn); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index bae62bf0317..3810c672272 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -9,11 +9,13 @@ #pragma once #include +#include "db/flush_scheduler.h" +#include "db/trim_history_scheduler.h" #include "db/write_thread.h" -#include "rocksdb/types.h" -#include "rocksdb/write_batch.h" #include "rocksdb/db.h" #include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/write_batch.h" #include "util/autovector.h" namespace rocksdb { @@ -113,10 +115,10 @@ class WriteBatchInternal { static Status InsertNoop(WriteBatch* batch); // Return the number of entries in the batch. - static int Count(const WriteBatch* batch); + static uint32_t Count(const WriteBatch* batch); // Set the count for the number of entries in the batch. - static void SetCount(WriteBatch* batch, int n); + static void SetCount(WriteBatch* batch, uint32_t n); // Return the sequence number for the start of this batch. 
static SequenceNumber Sequence(const WriteBatch* batch); @@ -162,6 +164,7 @@ class WriteBatchInternal { static Status InsertInto( WriteThread::WriteGroup& write_group, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, bool seq_per_batch = false, bool batch_per_txn = true); @@ -171,6 +174,7 @@ class WriteBatchInternal { static Status InsertInto( const WriteBatch* batch, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr, @@ -179,11 +183,13 @@ class WriteBatchInternal { static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, bool seq_per_batch = false, size_t batch_cnt = 0, - bool batch_per_txn = true); + bool batch_per_txn = true, + bool hint_per_batch = false); static Status Append(WriteBatch* dst, const WriteBatch* src, const bool WAL_only = false); @@ -192,6 +198,10 @@ class WriteBatchInternal { // leftByteSize and a WriteBatch with ByteSize rightByteSize static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); + // Iterate over [begin, end) range of a write batch + static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler, + size_t begin, size_t end); + // This write batch includes the latest state that should be persisted. Such // state meant to be used only during recovery. 
static void SetAsLastestPersistentState(WriteBatch* b); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 322bd8945b0..869cfa8cb72 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -18,8 +18,8 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/testharness.h" namespace rocksdb { @@ -35,8 +35,9 @@ static std::string PrintContents(WriteBatch* b) { mem->Ref(); std::string state; ColumnFamilyMemTablesDefault cf_mems_default(mem); - Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr); - int count = 0; + Status s = + WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr); + uint32_t count = 0; int put_count = 0; int delete_count = 0; int single_delete_count = 0; @@ -131,8 +132,8 @@ class WriteBatchTest : public testing::Test {}; TEST_F(WriteBatchTest, Empty) { WriteBatch batch; ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); - ASSERT_EQ(0, batch.Count()); + ASSERT_EQ(0u, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(0u, batch.Count()); } TEST_F(WriteBatchTest, Multiple) { @@ -143,14 +144,14 @@ TEST_F(WriteBatchTest, Multiple) { batch.Put(Slice("baz"), Slice("boo")); WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(4, WriteBatchInternal::Count(&batch)); + ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); ASSERT_EQ( "Put(baz, boo)@103" "Delete(box)@101" "Put(foo, bar)@100" "DeleteRange(bar, foo)@102", PrintContents(&batch)); - ASSERT_EQ(4, batch.Count()); + ASSERT_EQ(4u, batch.Count()); } TEST_F(WriteBatchTest, Corruption) { @@ -173,19 +174,19 @@ TEST_F(WriteBatchTest, Append) { WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("", PrintContents(&b1)); - ASSERT_EQ(0, b1.Count()); + ASSERT_EQ(0u, b1.Count()); b2.Put("a", "va"); WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); - ASSERT_EQ(1, b1.Count()); + ASSERT_EQ(1u, b1.Count()); b2.Clear(); b2.Put("b", "vb"); WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@201", PrintContents(&b1)); - ASSERT_EQ(2, b1.Count()); + ASSERT_EQ(2u, b1.Count()); b2.Delete("foo"); WriteBatchInternal::Append(&b1, &b2); ASSERT_EQ("Put(a, va)@200" @@ -193,7 +194,7 @@ TEST_F(WriteBatchTest, Append) { "Put(b, vb)@201" "Delete(foo)@203", PrintContents(&b1)); - ASSERT_EQ(4, b1.Count()); + ASSERT_EQ(4u, b1.Count()); b2.Clear(); b2.Put("c", "cc"); b2.Put("d", "dd"); @@ -208,29 +209,29 @@ TEST_F(WriteBatchTest, Append) { "Put(d, dd)@205" "Delete(foo)@203", PrintContents(&b1)); - ASSERT_EQ(6, b1.Count()); + ASSERT_EQ(6u, b1.Count()); ASSERT_EQ( "Put(c, cc)@0" "Put(d, dd)@1" "Put(e, ee)@2", PrintContents(&b2)); - ASSERT_EQ(3, b2.Count()); + ASSERT_EQ(3u, b2.Count()); } TEST_F(WriteBatchTest, SingleDeletion) { WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, batch.Count()); + ASSERT_EQ(0u, batch.Count()); batch.Put("a", "va"); ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); batch.SingleDelete("a"); ASSERT_EQ( "SingleDelete(a)@101" "Put(a, va)@100", PrintContents(&batch)); - ASSERT_EQ(2, batch.Count()); + ASSERT_EQ(2u, batch.Count()); } namespace { @@ -316,7 +317,7 @@ namespace { TEST_F(WriteBatchTest, PutNotImplemented) { WriteBatch batch; 
batch.Put(Slice("k1"), Slice("v1")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -326,7 +327,7 @@ TEST_F(WriteBatchTest, PutNotImplemented) { TEST_F(WriteBatchTest, DeleteNotImplemented) { WriteBatch batch; batch.Delete(Slice("k2")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -336,7 +337,7 @@ TEST_F(WriteBatchTest, DeleteNotImplemented) { TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { WriteBatch batch; batch.SingleDelete(Slice("k2")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -346,7 +347,7 @@ TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { TEST_F(WriteBatchTest, MergeNotImplemented) { WriteBatch batch; batch.Merge(Slice("foo"), Slice("bar")); - ASSERT_EQ(1, batch.Count()); + ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); WriteBatch::Handler handler; @@ -363,7 +364,7 @@ TEST_F(WriteBatchTest, Blob) { batch.SingleDelete(Slice("k3")); batch.PutLogData(Slice("blob2")); batch.Merge(Slice("foo"), Slice("bar")); - ASSERT_EQ(6, batch.Count()); + ASSERT_EQ(6u, batch.Count()); ASSERT_EQ( "Merge(foo, bar)@5" "Put(k1, v1)@0" @@ -398,7 +399,7 @@ TEST_F(WriteBatchTest, PrepareCommit) { ASSERT_EQ(s, Status::NotFound()); WriteBatchInternal::MarkCommit(&batch, Slice("xid1")); WriteBatchInternal::MarkRollback(&batch, Slice("xid1")); - ASSERT_EQ(2, batch.Count()); + ASSERT_EQ(2u, batch.Count()); TestHandler handler; batch.Iterate(&handler); @@ -488,7 +489,7 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { batch.Put(raw, raw); } - ASSERT_EQ(2, batch.Count()); + ASSERT_EQ(2u, batch.Count()); struct NoopHandler : public WriteBatch::Handler { int num_seen = 0; @@ -599,7 +600,7 @@ TEST_F(WriteBatchTest, PutGatherSlices) { "Put(foo, bar)@100" "Put(keypart2part3, value)@102", PrintContents(&batch)); - ASSERT_EQ(3, batch.Count()); + ASSERT_EQ(3u, batch.Count()); } namespace { diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index cb880560efc..1ab97b04589 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -11,14 +11,14 @@ #include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/write_callback.h" +#include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/write_batch.h" -#include "port/port.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" #include "util/random.h" -#include "util/sync_point.h" -#include "util/testharness.h" using std::string; @@ -124,6 +124,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { {false, false, true, false, true}, }; + for (auto& unordered_write : {true, false}) { for (auto& seq_per_batch : {true, false}) { for (auto& two_queues : {true, false}) { for (auto& allow_parallel : {true, false}) { @@ -133,15 +134,22 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { for (auto& write_group : write_scenarios) { Options options; options.create_if_missing = true; + options.unordered_write = unordered_write; options.allow_concurrent_memtable_write = allow_parallel; options.enable_pipelined_write = enable_pipelined_write; options.two_write_queues = two_queues; + // Skip unsupported combinations if (options.enable_pipelined_write && seq_per_batch) { - // This combination is not supported continue; } if (options.enable_pipelined_write && 
options.two_write_queues) { - // This combination is not supported + continue; + } + if (options.unordered_write && + !options.allow_concurrent_memtable_write) { + continue; + } + if (options.unordered_write && options.enable_pipelined_write) { continue; } @@ -296,7 +304,8 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {} Status Callback(SequenceNumber last_seq, bool /*not used*/, - uint64_t) override { + uint64_t, size_t /*index*/, + size_t /*total*/) override { db_impl_->SetLastPublishedSequence(last_seq); return Status::OK(); } @@ -358,8 +367,9 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { } } } -} -} + } + } + } } TEST_F(WriteCallbackTest, WriteCallBackTest) { diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 55feb00a339..919c2c11808 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -8,7 +8,7 @@ #include "db/write_controller.h" #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/write_thread.cc b/db/write_thread.cc index 835992c8fce..1ded68fde3b 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -9,8 +9,8 @@ #include "db/column_family.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" +#include "test_util/sync_point.h" #include "util/random.h" -#include "util/sync_point.h" namespace rocksdb { @@ -22,6 +22,8 @@ WriteThread::WriteThread(const ImmutableDBOptions& db_options) allow_concurrent_memtable_write_( db_options.allow_concurrent_memtable_write), enable_pipelined_write_(db_options.enable_pipelined_write), + max_write_batch_group_size_bytes( + db_options.max_write_batch_group_size_bytes), newest_writer_(nullptr), newest_memtable_writer_(nullptr), last_sequence_(0), @@ -406,9 +408,10 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader, // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128 << 10)) { - max_size = size + (128 << 10); + size_t max_size = max_write_batch_group_size_bytes; + const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8; + if (size <= min_batch_size_bytes) { + max_size = size + min_batch_size_bytes; } leader->write_group = write_group; @@ -485,9 +488,10 @@ void WriteThread::EnterAsMemTableWriter(Writer* leader, // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128 << 10)) { - max_size = size + (128 << 10); + size_t max_size = max_write_batch_group_size_bytes; + const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8; + if (size <= min_batch_size_bytes) { + max_size = size + min_batch_size_bytes; } leader->write_group = write_group; diff --git a/db/write_thread.h b/db/write_thread.h index dc9c22ff87e..e1db970663b 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -360,6 +360,11 @@ class WriteThread { // Enable pipelined write to WAL and memtable. const bool enable_pipelined_write_; + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + const uint64_t max_write_batch_group_size_bytes; + // Points to the newest pending writer. 
Only leader can remove
  // elements, adding can be done lock-free by anybody.
  std::atomic<Writer*> newest_writer_;
diff --git a/defs.bzl b/defs.bzl
index f3e8339783e..83e9a579f9e 100644
--- a/defs.bzl
+++ b/defs.bzl
@@ -1,4 +1,7 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# defs.bzl - Definitions for Facebook-specific buck build integration
+# in TARGETS

load("@fbcode_macros//build_defs:cpp_binary.bzl", "cpp_binary")
load("@fbcode_macros//build_defs:custom_unittest.bzl", "custom_unittest")
@@ -8,9 +11,13 @@ def test_binary(
        test_cc,
        parallelism,
        rocksdb_arch_preprocessor_flags,
+        rocksdb_os_preprocessor_flags,
        rocksdb_compiler_flags,
        rocksdb_preprocessor_flags,
-        rocksdb_external_deps):
+        rocksdb_external_deps,
+        rocksdb_os_deps,
+        extra_deps,
+        extra_compiler_flags):
    TEST_RUNNER = native.package_name() + "/buckifier/rocks_test_runner.sh"

    ttype = "gtest" if parallelism == "parallel" else "simple"
@@ -20,9 +27,11 @@ def test_binary(
        name = test_bin,
        srcs = [test_cc],
        arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
-        compiler_flags = rocksdb_compiler_flags,
+        os_preprocessor_flags = rocksdb_os_preprocessor_flags,
+        compiler_flags = rocksdb_compiler_flags + extra_compiler_flags,
        preprocessor_flags = rocksdb_preprocessor_flags,
-        deps = [":rocksdb_test_lib"],
+        deps = [":rocksdb_test_lib"] + extra_deps,
+        os_deps = rocksdb_os_deps,
        external_deps = rocksdb_external_deps,
    )
diff --git a/docs/.gitignore b/docs/.gitignore
index e48dc98be89..3938549cbe6 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -6,4 +6,3 @@ _site
.sass-cache
*.psd
*~
-
diff --git a/docs/_posts/2019-08-15-unordered-write.markdown b/docs/_posts/2019-08-15-unordered-write.markdown
new file mode 100644
index 00000000000..5f0eb2880a4
--- /dev/null
+++ b/docs/_posts/2019-08-15-unordered-write.markdown
@@ -0,0 +1,56 @@
+---
+title: Higher write throughput with `unordered_write` feature
+layout: post
+author: maysamyabandeh
+category: blog
+---
+
+Since RocksDB 6.3, the `unordered_write=true` option together with WritePrepared transactions offers 34-42% higher write throughput compared to vanilla RocksDB. If the application can handle more relaxed ordering guarantees, the gain in throughput increases to 63-131%.
+
+### Background
+
+Currently the RocksDB API delivers the following powerful guarantees:
+- Atomic reads: Either all of a write batch is visible to reads or none of it.
+- Read-your-own-writes: When a write thread returns to the user, a subsequent read by the same thread will be able to see its own writes.
+- Immutable Snapshots: The reads visible to the snapshot are immutable in the sense that they will not be affected by any in-flight or future writes.
+
+### `unordered_write`
+
+The `unordered_write` feature, when turned on, relaxes the default guarantees of RocksDB. While it still provides the read-your-own-writes property, neither atomic reads nor the immutable-snapshot properties are provided any longer. However, RocksDB users can still get read-your-own-writes and immutable snapshots when using this feature in conjunction with a TransactionDB configured with WritePrepared transactions and `two_write_queues`. You can read [here](https://github.com/facebook/rocksdb/wiki/unordered_write) to learn about the design of `unordered_write` and [here](https://github.com/facebook/rocksdb/wiki/WritePrepared-Transactions) to learn more about WritePrepared transactions.
+
+### How to use it?
+
+To get the same guarantees as vanilla RocksDB:
+
+    DBOptions db_options;
+    db_options.unordered_write = true;
+    db_options.two_write_queues = true;
+    DB* db;
+    {
+      TransactionDBOptions txn_db_options;
+      txn_db_options.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
+      txn_db_options.skip_concurrency_control = true;
+      TransactionDB* txn_db;
+      TransactionDB::Open(db_options, txn_db_options, kDBPath, &txn_db);
+      db = txn_db;
+    }
+    db->Write(...);
+
+To get relaxed guarantees:
+
+    DBOptions db_options;
+    db_options.unordered_write = true;
+    DB* db;
+    DB::Open(db_options, kDBPath, &db);
+    db->Write(...);
+
+### Benchmarks
+
+    TEST_TMPDIR=/dev/shm/ ~/db_bench --benchmarks=fillrandom --threads=32 --num=10000000 -max_write_buffer_number=16 --max_background_jobs=64 --batch_size=8 --writes=3000000 -level0_file_num_compaction_trigger=99999 --level0_slowdown_writes_trigger=99999 --level0_stop_writes_trigger=99999 -enable_pipelined_write=false -disable_auto_compactions --transaction_db=true --unordered_write=1 --disable_wal=0
+
+Throughput with `unordered_write`=true and using WritePrepared transactions:
+- WAL: +42%
+- No-WAL: +34%
+
+Throughput with `unordered_write`=true:
+- WAL: +63%
+- No-WAL: +131%
diff --git a/env/env.cc b/env/env.cc
index dcf79fb7fe7..6aad4a53e81 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -10,11 +10,13 @@
#include "rocksdb/env.h"

#include <thread>
+#include "logging/env_logger.h"
+#include "memory/arena.h"
#include "options/db_options.h"
#include "port/port.h"
#include "port/sys_time.h"
#include "rocksdb/options.h"
-#include "util/arena.h"
+#include "rocksdb/utilities/object_registry.h"
#include "util/autovector.h"

namespace rocksdb {
@@ -22,6 +24,55 @@ namespace rocksdb {

Env::~Env() {
}
+ options.writable_file_max_buffer_size = 1024 * 1024; + std::unique_ptr writable_file; + const auto status = env->NewWritableFile(fname, &writable_file, options); + if (!status.ok()) { + return status; + } + + *result = std::make_shared(std::move(writable_file), fname, + options, env); + return Status::OK(); +} } // namespace rocksdb diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 93764d945f9..c955bdb7141 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -11,8 +11,7 @@ #include "env/mock_env.h" #include "rocksdb/env.h" -#include "rocksdb/utilities/object_registry.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { @@ -104,13 +103,12 @@ namespace { // ValuesIn() will skip running tests when given an empty collection. std::vector GetCustomEnvs() { static Env* custom_env; - static std::unique_ptr custom_env_guard; static bool init = false; if (!init) { init = true; const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { - custom_env = NewCustomObject(uri, &custom_env_guard); + Env::LoadEnv(uri, &custom_env); } } diff --git a/env/env_encryption.cc b/env/env_encryption.cc index df1b0011a01..b7095c0f579 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -195,23 +195,26 @@ class EncryptedWritableFile : public WritableFileWrapper { EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, size_t prefixLength) : WritableFileWrapper(f), file_(f), stream_(s), prefixLength_(prefixLength) { } - Status Append(const Slice& data) override { + Status Append(const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); if (data.size() > 0) { auto offset = file_->GetFileSize(); // size including prefix // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). 
memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Append(dataToAppend); + status = file_->Append(dataToAppend); if (!status.ok()) { return status; } @@ -221,18 +224,19 @@ class EncryptedWritableFile : public WritableFileWrapper { Status PositionedAppend(const Slice& data, uint64_t offset) override { AlignedBuffer buf; Status status; - Slice dataToAppend(data); + Slice dataToAppend(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToAppend = Slice(buf.BufferStart(), data.size()); + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } status = file_->PositionedAppend(dataToAppend, offset); if (!status.ok()) { @@ -325,18 +329,19 @@ class EncryptedRandomRWFile : public RandomRWFile { Status Write(uint64_t offset, const Slice& data) override { AlignedBuffer buf; Status status; - Slice dataToWrite(data); + Slice dataToWrite(data); offset += prefixLength_; if (data.size() > 0) { // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); + buf.Size(data.size()); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); if (!status.ok()) { return status; } - dataToWrite = Slice(buf.BufferStart(), data.size()); + dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); } status = file_->Write(offset, dataToWrite); return status; @@ -393,13 +398,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -430,13 +436,14 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; @@ -467,12 +474,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - 
prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -513,12 +521,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -554,12 +563,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -609,11 +619,13 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } else { - // File is new, initialize & write prefix + // File is new, initialize & write prefix provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Write(0, prefixSlice); if (!status.ok()) { return status; @@ -630,7 +642,7 @@ class EncryptedEnv : public EnvWrapper { return Status::OK(); } - // Store in *result the attributes of the children of the specified directory. + // Store in *result the attributes of the children of the specified directory. // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. @@ -670,8 +682,7 @@ class EncryptedEnv : public EnvWrapper { EncryptionProvider *provider_; }; - -// Returns an Env that encrypts data when stored on disk and decrypts data when +// Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { return new EncryptedEnv(base_env, provider); @@ -694,14 +705,14 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not encrypting a full block. + // We're not encrypting a full block. 
// Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy plain data to block buffer + // Copy plain data to block buffer memmove(block + blockOffset, data, n); } auto status = EncryptBlock(blockIndex, block, (char*)scratch.data()); @@ -734,21 +745,19 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t std::string scratch; AllocateScratch(scratch); - assert(fileOffset < dataSize); - // Decrypt individual blocks. while (1) { char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not decrypting a full block. + // We're not decrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy encrypted data to block buffer + // Copy encrypted data to block buffer memmove(block + blockOffset, data, n); } auto status = DecryptBlock(blockIndex, block, (char*)scratch.data()); @@ -807,7 +816,7 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); - // Encrypt nonce+counter + // Encrypt nonce+counter auto status = cipher_.Encrypt(scratch); if (!status.ok()) { return status; @@ -823,13 +832,13 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { - // For CTR decryption & encryption are the same + // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. -// For optimal performance, the prefix length should be a multiple of +// For optimal performance, the prefix length should be a multiple of // the page size. size_t CTREncryptionProvider::GetPrefixLength() { return defaultPrefixLength; @@ -844,7 +853,7 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & iv = Slice(prefix + blockSize, blockSize); } -// CreateNewPrefix initialized an allocated block of prefix memory +// CreateNewPrefix initialized an allocated block of prefix memory // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, @@ -873,7 +882,7 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, return Status::OK(); } -// PopulateSecretPrefixPart initializes the data into a new prefix block +// PopulateSecretPrefixPart initializes the data into a new prefix block // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. 
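[Editor's note] The encrypted-Env hunks above and below mainly make the prefix and cipher-stream code track buffer lengths through `AlignedBuffer::Size()`/`CurrentSize()`. For reviewers unfamiliar with this code path, here is a minimal sketch of how an encrypted Env is typically wired together, assuming the `ROT13BlockCipher` test cipher and `CTREncryptionProvider` declared in `rocksdb/env_encryption.h`; the database path is hypothetical and the snippet is illustrative only, not part of this diff.

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/env_encryption.h"

int main() {
  // Test-only cipher; a real deployment would plug in its own BlockCipher.
  rocksdb::ROT13BlockCipher cipher(/*blockSize=*/32);
  // The CTR provider stores a per-file prefix holding the counter/IV,
  // which is what GetPrefixLength()/CreateNewPrefix() above manage.
  rocksdb::CTREncryptionProvider provider(cipher);
  std::unique_ptr<rocksdb::Env> encrypted_env(
      rocksdb::NewEncryptedEnv(rocksdb::Env::Default(), &provider));

  rocksdb::Options options;
  options.create_if_missing = true;
  options.env = encrypted_env.get();  // route all file I/O through EncryptedEnv

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/encrypted_db", &db);
  if (s.ok()) {
    db->Put(rocksdb::WriteOptions(), "key", "value");
    delete db;
  }
  return 0;
}
```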
@@ -908,7 +917,7 @@ Status CTREncryptionProvider::CreateCipherStream( return status; } - // Create cipher stream + // Create cipher stream return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); } diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 9d0354cced8..207f0815bc4 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -17,8 +17,8 @@ #include #include #include +#include "logging/logging.h" #include "rocksdb/status.h" -#include "util/logging.h" #include "util/string_util.h" #define HDFS_EXISTS 0 @@ -420,7 +420,7 @@ Status HdfsEnv::NewRandomAccessFile(const std::string& fname, // create a new file for writing Status HdfsEnv::NewWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& /*options*/) { + const EnvOptions& options) { result->reset(); Status s; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); @@ -590,6 +590,11 @@ Status HdfsEnv::UnlockFile(FileLock* /*lock*/) { return Status::OK(); } Status HdfsEnv::NewLogger(const std::string& fname, std::shared_ptr* result) { + // EnvOptions is used exclusively for its `strict_bytes_per_sync` value. That + // option is only intended for WAL/flush/compaction writes, so turn it off in + // the logger. + EnvOptions options; + options.strict_bytes_per_sync = false; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options); if (f == nullptr || !f->isValid()) { delete f; diff --git a/env/env_posix.cc b/env/env_posix.cc index 387c0279397..83e209bf1f8 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -7,8 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors #include +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +#include +#endif #include #include + #if defined(OS_LINUX) #include #endif @@ -33,6 +37,7 @@ // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) #elif defined(__MACH__) +#include #include #include #else @@ -43,18 +48,18 @@ #include #include "env/io_posix.h" -#include "env/posix_logger.h" +#include "logging/logging.h" +#include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" -#include "util/logging.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" #include "util/thread_local.h" #include "util/threadpool_imp.h" @@ -69,6 +74,17 @@ #endif namespace rocksdb { +#if defined(OS_WIN) +static const std::string kSharedLibExt = ".dll"; +static const char kPathSeparator = ';'; +#else +static const char kPathSeparator = ':'; +#if defined(OS_MACOSX) +static const std::string kSharedLibExt = ".dylib"; +#else +static const std::string kSharedLibExt = ".so"; +#endif +#endif namespace { @@ -115,6 +131,33 @@ int cloexec_flags(int flags, const EnvOptions* options) { return flags; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +class PosixDynamicLibrary : public DynamicLibrary { + public: + PosixDynamicLibrary(const std::string& name, void* handle) + : name_(name), handle_(handle) {} + ~PosixDynamicLibrary() override { dlclose(handle_); } + + Status LoadSymbol(const std::string& sym_name, void** func) override { + assert(nullptr != func); + dlerror(); // Clear any old error + *func = dlsym(handle_, sym_name.c_str()); + if (*func != nullptr) { + return Status::OK(); + } 
else { + char* err = dlerror(); + return Status::NotFound("Error finding symbol: " + sym_name, err); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + void* handle_; +}; +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + class PosixEnv : public Env { public: PosixEnv(); @@ -729,6 +772,62 @@ class PosixEnv : public Env { return result; } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION + // Loads the named library into the result. + // If the input name is empty, the current executable is loaded + // On *nix systems, a "lib" prefix is added to the name if one is not supplied + // Comparably, the appropriate shared library extension is added to the name + // if not supplied. If search_path is not specified, the shared library will + // be loaded using the default path (LD_LIBRARY_PATH) If search_path is + // specified, the shared library will be searched for in the directories + // provided by the search path + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr* result) override { + Status status; + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + std::string library_name = name; + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(library_name, hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: xs", name), dlerror()); + } +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = nullptr) override; @@ -789,13 +888,14 @@ class PosixEnv : public Env { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w" + f = fopen(fname.c_str(), + "w" #ifdef __GLIBC_PREREQ #if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC + "e" // glibc extension to enable O_CLOEXEC #endif #endif - ); + ); } if (f == nullptr) { result->reset(); @@ -839,7 +939,7 @@ class PosixEnv : public Env { uint64_t NowCPUNanos() override { #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) || \ - defined(__MACH__) + (defined(__MACH__) && defined(__MAC_10_12)) struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; diff --git a/env/env_test.cc b/env/env_test.cc index 47800928499..004adf26e5a 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -11,13 +11,6 @@ #include #endif -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #include #include @@ -38,15 +31,16 @@ #endif #include "env/env_chroot.h" +#include "logging/log_buffer.h" +#include 
"port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" -#include "util/log_buffer.h" #include "util/mutexlock.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #ifdef OS_LINUX static const size_t kPageSize = sysconf(_SC_PAGESIZE); @@ -247,6 +241,52 @@ TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { ASSERT_EQ(expected_data, actual_data); } +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +TEST_F(EnvPosixTest, LoadRocksDBLibrary) { + std::shared_ptr library; + std::function function; + Status status = env_->LoadLibrary("no-such-library", "", &library); + ASSERT_NOK(status); + ASSERT_EQ(nullptr, library.get()); + status = env_->LoadLibrary("rocksdb", "", &library); + if (status.ok()) { // If we have can find a rocksdb shared library + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(library->LoadFunction("rocksdb_create_default_env", + &function)); // from C definition + ASSERT_NE(nullptr, function); + ASSERT_NOK(library->LoadFunction("no-such-method", &function)); + ASSERT_EQ(nullptr, function); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } else { + ASSERT_EQ(nullptr, library.get()); + } +} +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) +TEST_F(EnvPosixTest, LoadRocksDBLibraryWithSearchPath) { + std::shared_ptr library; + std::function function; + ASSERT_NOK(env_->LoadLibrary("no-such-library", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + ASSERT_NOK(env_->LoadLibrary("dl", "/tmp", &library)); + ASSERT_EQ(nullptr, library.get()); + Status status = env_->LoadLibrary("rocksdb", "/tmp:./", &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } + char buff[1024]; + std::string cwd = getcwd(buff, sizeof(buff)); + + status = env_->LoadLibrary("rocksdb", "/tmp:" + cwd, &library); + if (status.ok()) { + ASSERT_NE(nullptr, library.get()); + ASSERT_OK(env_->LoadLibrary(library->Name(), "", &library)); + } +} +#endif // !OS_WIN && !ROCKSDB_NO_DYNAMIC_EXTENSION + TEST_P(EnvPosixTestWithParam, UnSchedule) { std::atomic called(false); env_->SetBackgroundThreads(1, Env::LOW); @@ -843,7 +883,9 @@ TEST_F(EnvPosixTest, PositionedAppend) { } #endif // !ROCKSDB_LITE -// Only works in linux platforms +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { // Create file. if (env_ == Env::Default()) { @@ -886,6 +928,7 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { env_->DeleteFile(fname); } } +#endif // !defined(OS_WIN) // only works in linux platforms #ifdef ROCKSDB_FALLOCATE_PRESENT @@ -976,7 +1019,9 @@ bool HasPrefix(const std::unordered_set& ss) { return false; } -// Only works in linux and WIN platforms +// `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can +// handle a return value of zero but this test case cannot. +#ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { if (env_ == Env::Default()) { // Check whether a bunch of concurrently existing files have unique IDs. 
@@ -1018,7 +1063,6 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { } } -// Only works in linux and WIN platforms TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { if (env_ == Env::Default()) { EnvOptions soptions; @@ -1058,6 +1102,62 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { ASSERT_TRUE(!HasPrefix(ids)); } } +#endif // !defined(OS_WIN) + +TEST_P(EnvPosixTestWithParam, MultiRead) { + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + const size_t kSectorSize = 4096; + const size_t kNumSectors = 8; + + // Create file. + { + std::unique_ptr wfile; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_writes) { + soptions.use_direct_writes = false; + } +#endif + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + for (size_t i = 0; i < kNumSectors; ++i) { + auto data = NewAligned(kSectorSize * 8, static_cast(i + 1)); + Slice slice(data.get(), kSectorSize); + ASSERT_OK(wfile->Append(slice)); + } + ASSERT_OK(wfile->Close()); + } + + // Random Read + { + std::unique_ptr file; + std::vector reqs(3); + std::vector> data; + uint64_t offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + reqs[i].offset = offset; + offset += 2 * kSectorSize; + reqs[i].len = kSectorSize; + data.emplace_back(NewAligned(kSectorSize, 0)); + reqs[i].scratch = data.back().get(); + } +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } +#endif + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + for (size_t i = 0; i < reqs.size(); ++i) { + auto buf = NewAligned(kSectorSize * 8, static_cast(i * 2 + 1)); + ASSERT_OK(reqs[i].status); + ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0); + } + } +} // Only works in linux platforms #ifdef OS_WIN diff --git a/env/io_posix.cc b/env/io_posix.cc index 0f86c3ff93f..3572d7cc9a4 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -14,6 +14,9 @@ #include #if defined(OS_LINUX) #include +#ifndef FALLOC_FL_KEEP_SIZE +#include +#endif #endif #include #include @@ -27,17 +30,16 @@ #include #include #endif -#include "env/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" #include "rocksdb/slice.h" +#include "test_util/sync_point.h" #include "util/coding.h" #include "util/string_util.h" -#include "util/sync_point.h" #if defined(OS_LINUX) && !defined(F_SET_RW_HINT) #define F_LINUX_SPECIFIC_BASE 1024 -#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) #endif namespace rocksdb { @@ -58,6 +60,57 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) { namespace { +// On MacOS (and probably *BSD), the posix write and pwrite calls do not support +// buffers larger than 2^31-1 bytes. These two wrappers fix this issue by +// cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep +// the writes aligned. 
+ +bool PosixWrite(int fd, const char* buf, size_t nbyte) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = write(fd, src, bytes_to_write); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + src += done; + } + return true; +} + +bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) { + const size_t kLimit1Gb = 1UL << 30; + + const char* src = buf; + size_t left = nbyte; + + while (left != 0) { + size_t bytes_to_write = std::min(left, kLimit1Gb); + + ssize_t done = pwrite(fd, src, bytes_to_write, offset); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return false; + } + left -= done; + offset += done; + src += done; + } + + return true; +} + size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { #ifdef OS_LINUX struct stat buf; @@ -135,28 +188,34 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { #define ZFS_SUPER_MAGIC 0x2fc12fc1 #endif -bool IsSyncFileRangeSupported(int __attribute__((__unused__)) fd) { - // `fstatfs` is only available on Linux, but so is `sync_file_range`, so - // `defined(ROCKSDB_RANGESYNC_PRESENT)` should imply `defined(OS_LINUX)`. +bool IsSyncFileRangeSupported(int fd) { + // The approach taken in this function is to build a blacklist of cases where + // we know `sync_file_range` definitely will not work properly despite passing + // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or + // if any of the checks fail in unexpected ways, we allow `sync_file_range` to + // be used. This way should minimize risk of impacting existing use cases. struct statfs buf; int ret = fstatfs(fd, &buf); assert(ret == 0); - if (ret != 0) { - // We don't know whether the filesystem properly supports `sync_file_range`. - // Even if it doesn't, we don't know of any safety issue with trying to call - // it anyways. So, to preserve the same behavior as before this `fstatfs` - // check was introduced, we assume `sync_file_range` is usable. - return true; - } - if (buf.f_type == ZFS_SUPER_MAGIC) { + if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { // Testing on ZFS showed the writeback did not happen asynchronously when // `sync_file_range` was called, even though it returned success. Avoid it // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, // even though this'll incur extra I/O for metadata. return false; } - // No known problems with other filesystems' implementations of - // `sync_file_range`, so allow them to use it. + + ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); + assert(!(ret == -1 && errno != ENOSYS)); + if (ret == -1 && errno == ENOSYS) { + // `sync_file_range` is not implemented on all platforms even if + // compile-time checks pass and a supported filesystem is in-use. For + // example, using ext4 on WSL (Windows Subsystem for Linux), + // `sync_file_range()` returns `ENOSYS` + // ("Function not implemented"). + return false; + } + // None of the cases on the blacklist matched, so allow `sync_file_range` use. 
return true; } @@ -180,7 +239,7 @@ bool IsSectorAligned(const void* ptr, size_t sector_size) { return uintptr_t(ptr) % sector_size == 0; } -} +} // namespace #endif /* @@ -747,14 +806,14 @@ Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); int alloc_status = 0; if (allow_fallocate_) { - alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, - static_cast(offset), static_cast(len)); + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -801,19 +860,13 @@ Status PosixWritableFile::Append(const Slice& data) { assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = write(fd_, src, left); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While appending to file", filename_, errno); - } - left -= done; - src += done; + size_t nbytes = data.size(); + + if (!PosixWrite(fd_, src, nbytes)) { + return IOError("While appending to file", filename_, errno); } - filesize_ += data.size(); + + filesize_ += nbytes; return Status::OK(); } @@ -823,23 +876,14 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); } - assert(offset <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, static_cast(offset)); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError("While pwrite to file at offset " + ToString(offset), - filename_, errno); - } - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError("While pwrite to file at offset " + ToString(offset), + filename_, errno); } - filesize_ = offset; + filesize_ = offset + nbytes; return Status::OK(); } @@ -891,8 +935,8 @@ Status PosixWritableFile::Close() { // If not, we should hack it with FALLOC_FL_PUNCH_HOLE if (result == 0 && (file_stats.st_size + file_stats.st_blksize - 1) / - file_stats.st_blksize != - file_stats.st_blocks / (file_stats.st_blksize / 512)) { + file_stats.st_blksize != + file_stats.st_blocks / (file_stats.st_blksize / 512)) { IOSTATS_TIMER_GUARD(allocate_nanos); if (allow_fallocate_) { fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, @@ -942,10 +986,10 @@ void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { } #else (void)hint; -#endif // ROCKSDB_VALGRIND_RUN +#endif // ROCKSDB_VALGRIND_RUN #else (void)hint; -#endif // OS_LINUX +#endif // OS_LINUX } Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { @@ -968,15 +1012,15 @@ Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { #ifdef ROCKSDB_FALLOCATE_PRESENT Status 
PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { - assert(offset <= std::numeric_limits::max()); - assert(len <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(len <= static_cast(std::numeric_limits::max())); TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { - alloc_status = fallocate( - fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, - static_cast(offset), static_cast(len)); + alloc_status = + fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, + static_cast(offset), static_cast(len)); } if (alloc_status == 0) { return Status::OK(); @@ -990,8 +1034,8 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) { Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { #ifdef ROCKSDB_RANGESYNC_PRESENT - assert(offset <= std::numeric_limits::max()); - assert(nbytes <= std::numeric_limits::max()); + assert(offset <= static_cast(std::numeric_limits::max())); + assert(nbytes <= static_cast(std::numeric_limits::max())); if (sync_file_range_supported_) { int ret; if (strict_bytes_per_sync_) { @@ -1037,24 +1081,11 @@ PosixRandomRWFile::~PosixRandomRWFile() { Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = pwrite(fd_, src, left, offset); - if (done < 0) { - // error while writing to file - if (errno == EINTR) { - // write was interrupted, try again. - continue; - } - return IOError( - "While write random read/write file at offset " + ToString(offset), - filename_, errno); - } - - // Wrote `done` bytes - left -= done; - offset += done; - src += done; + size_t nbytes = data.size(); + if (!PosixPositionedWrite(fd_, src, nbytes, static_cast(offset))) { + return IOError( + "While write random read/write file at offset " + ToString(offset), + filename_, errno); } return Status::OK(); diff --git a/env/mock_env.cc b/env/mock_env.cc index 793a0837ab8..6d3adc808ed 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -31,6 +31,9 @@ class MemFile { rnd_(static_cast( MurmurHash(fn.data(), static_cast(fn.size()), 0))), fsynced_bytes_(0) {} + // No copying allowed. + MemFile(const MemFile&) = delete; + void operator=(const MemFile&) = delete; void Ref() { MutexLock lock(&mutex_); @@ -154,10 +157,6 @@ class MemFile { // Private since only Unref() should be used to delete it. ~MemFile() { assert(refs_ == 0); } - // No copying allowed. - MemFile(const MemFile&); - void operator=(const MemFile&); - Env* env_; const std::string fn_; mutable port::Mutex mutex_; diff --git a/env/mock_env_test.cc b/env/mock_env_test.cc index 97c49b5f516..b21b953b568 100644 --- a/env/mock_env_test.cc +++ b/env/mock_env_test.cc @@ -10,7 +10,7 @@ #include #include "rocksdb/env.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc index b1c1d02ba25..921041a576e 100644 --- a/examples/multi_processes_example.cc +++ b/examples/multi_processes_example.cc @@ -14,8 +14,8 @@ // run for a while, tailing the logs of the primary. After process with primary // instance exits, this process will keep running until you hit 'CTRL+C'. 
-#include #include +#include #include #include #include diff --git a/examples/rocksdb_option_file_example.ini b/examples/rocksdb_option_file_example.ini index 351f1ed0107..dcbc9a308a8 100644 --- a/examples/rocksdb_option_file_example.ini +++ b/examples/rocksdb_option_file_example.ini @@ -104,7 +104,7 @@ compression=kSnappyCompression level0_file_num_compaction_trigger=4 purge_redundant_kvs_while_flush=true - max_write_buffer_number_to_maintain=0 + max_write_buffer_size_to_maintain=0 memtable_factory=SkipListFactory max_grandparent_overlap_factor=8 expanded_compaction_factor=25 diff --git a/examples/transaction_example.cc b/examples/transaction_example.cc index 7274cf7ec07..6d12651ada9 100644 --- a/examples/transaction_example.cc +++ b/examples/transaction_example.cc @@ -50,17 +50,33 @@ int main() { // Read a key OUTSIDE this transaction. Does not affect txn. s = txn_db->Get(read_options, "abc", &value); + assert(s.IsNotFound()); // Write a key OUTSIDE of this transaction. - // Does not affect txn since this is an unrelated key. If we wrote key 'abc' - // here, the transaction would fail to commit. + // Does not affect txn since this is an unrelated key. s = txn_db->Put(write_options, "xyz", "zzz"); + assert(s.ok()); + + // Write a key OUTSIDE of this transaction. + // Fail because the key conflicts with the key written in txn. + s = txn_db->Put(write_options, "abc", "def"); + assert(s.subcode() == Status::kLockTimeout); + + // Value for key "xyz" has been committed, can be read in txn. + s = txn->Get(read_options, "xyz", &value); + assert(s.ok()); + assert(value == "zzz"); // Commit transaction s = txn->Commit(); assert(s.ok()); delete txn; + // Value is committed, can be read now. + s = txn_db->Get(read_options, "abc", &value); + assert(s.ok()); + assert(value == "def"); + //////////////////////////////////////////////////////// // // "Repeatable Read" (Snapshot Isolation) Example diff --git a/util/delete_scheduler.cc b/file/delete_scheduler.cc similarity index 98% rename from util/delete_scheduler.cc rename to file/delete_scheduler.cc index f5ee2844896..b66956ca08c 100644 --- a/util/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -5,17 +5,17 @@ #ifndef ROCKSDB_LITE -#include "util/delete_scheduler.h" +#include "file/delete_scheduler.h" #include #include +#include "file/sst_file_manager_impl.h" +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" -#include "util/logging.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sst_file_manager_impl.h" -#include "util/sync_point.h" namespace rocksdb { diff --git a/util/delete_scheduler.h b/file/delete_scheduler.h similarity index 100% rename from util/delete_scheduler.h rename to file/delete_scheduler.h diff --git a/util/delete_scheduler_test.cc b/file/delete_scheduler_test.cc similarity index 99% rename from util/delete_scheduler_test.cc rename to file/delete_scheduler_test.cc index 0d8e354b9c0..b6d4b903c16 100644 --- a/util/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -3,23 +3,19 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include +#include "file/delete_scheduler.h" +#include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "util/delete_scheduler.h" -#include "util/sst_file_manager_impl.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #ifndef ROCKSDB_LITE diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc new file mode 100644 index 00000000000..89f32c6ff0b --- /dev/null +++ b/file/file_prefetch_buffer.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "file/file_prefetch_buffer.h" + +#include +#include + +#include "file/random_access_file_reader.h" +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader, + uint64_t offset, size_t n, + bool for_compaction) { + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + size_t offset_ = static_cast(offset); + uint64_t rounddown_offset = Rounddown(offset_, alignment); + uint64_t roundup_end = Roundup(offset_ + n, alignment); + uint64_t roundup_len = roundup_end - rounddown_offset; + assert(roundup_len >= alignment); + assert(roundup_len % alignment == 0); + + // Check if requested bytes are in the existing buffer_. + // If all bytes exist -- return. + // If only a few bytes exist -- reuse them & read only what is really needed. + // This is typically the case of incremental reading of data. + // If no bytes exist in buffer -- full pread. + + Status s; + uint64_t chunk_offset_in_buffer = 0; + uint64_t chunk_len = 0; + bool copy_data_to_new_buffer = false; + if (buffer_.CurrentSize() > 0 && offset >= buffer_offset_ && + offset <= buffer_offset_ + buffer_.CurrentSize()) { + if (offset + n <= buffer_offset_ + buffer_.CurrentSize()) { + // All requested bytes are already in the buffer. So no need to Read + // again. + return s; + } else { + // Only a few requested bytes are in the buffer. memmove those chunk of + // bytes to the beginning, and memcpy them back into the new buffer if a + // new buffer is created. + chunk_offset_in_buffer = + Rounddown(static_cast(offset - buffer_offset_), alignment); + chunk_len = buffer_.CurrentSize() - chunk_offset_in_buffer; + assert(chunk_offset_in_buffer % alignment == 0); + assert(chunk_len % alignment == 0); + assert(chunk_offset_in_buffer + chunk_len <= + buffer_offset_ + buffer_.CurrentSize()); + if (chunk_len > 0) { + copy_data_to_new_buffer = true; + } else { + // this reset is not necessary, but just to be safe. 
+ chunk_offset_in_buffer = 0; + } + } + } + + // Create a new buffer only if current capacity is not sufficient, and memcopy + // bytes from old buffer if needed (i.e., if chunk_len is greater than 0). + if (buffer_.Capacity() < roundup_len) { + buffer_.Alignment(alignment); + buffer_.AllocateNewBuffer(static_cast(roundup_len), + copy_data_to_new_buffer, chunk_offset_in_buffer, + static_cast(chunk_len)); + } else if (chunk_len > 0) { + // New buffer not needed. But memmove bytes from tail to the beginning since + // chunk_len is greater than 0. + buffer_.RefitTail(static_cast(chunk_offset_in_buffer), + static_cast(chunk_len)); + } + + Slice result; + s = reader->Read(rounddown_offset + chunk_len, + static_cast(roundup_len - chunk_len), &result, + buffer_.BufferStart() + chunk_len, for_compaction); + if (s.ok()) { + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast(chunk_len) + result.size()); + } + return s; +} + +bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n, + Slice* result, bool for_compaction) { + if (track_min_offset_ && offset < min_offset_read_) { + min_offset_read_ = static_cast(offset); + } + if (!enable_ || offset < buffer_offset_) { + return false; + } + + // If the buffer contains only a few of the requested bytes: + // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // and satisfy the request. + // If readahead is not enabled: return false. + if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { + if (readahead_size_ > 0) { + assert(file_reader_ != nullptr); + assert(max_readahead_size_ >= readahead_size_); + Status s; + if (for_compaction) { + s = Prefetch(file_reader_, offset, std::max(n, readahead_size_), + for_compaction); + } else { + s = Prefetch(file_reader_, offset, n + readahead_size_, for_compaction); + } + if (!s.ok()) { + return false; + } + readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); + } else { + return false; + } + } + + uint64_t offset_in_buffer = offset - buffer_offset_; + *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); + return true; +} +} // namespace rocksdb diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h new file mode 100644 index 00000000000..c3cacf1020e --- /dev/null +++ b/file/file_prefetch_buffer.h @@ -0,0 +1,97 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include "file/random_access_file_reader.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "util/aligned_buffer.h" + +namespace rocksdb { + +// FilePrefetchBuffer is a smart buffer to store and read data from a file. +class FilePrefetchBuffer { + public: + // Constructor. + // + // All arguments are optional. + // file_reader : the file reader to use. Can be a nullptr. + // readahead_size : the initial readahead size. + // max_readahead_size : the maximum readahead size. + // If max_readahead_size > readahead_size, the readahead size will be + // doubled on every IO until max_readahead_size is hit. + // Typically this is set as a multiple of readahead_size. 
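+  // (Illustrative numbers only: with readahead_size = 8 KB and
+  // max_readahead_size = 256 KB, the readahead component grows
+  // 8 KB -> 16 KB -> 32 KB ... and is capped at 256 KB.)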
+ // max_readahead_size should be greater than or equal to readahead_size. + // enable : controls whether reading from the buffer is enabled. + // If false, TryReadFromCache() always returns false, and we only take stats + // for the minimum offset if track_min_offset = true. + // track_min_offset : Track the minimum offset ever read and collect stats on + // it. Used for adaptable readahead of the file footer/metadata. + // + // Automatic readahead is enabled for a file if file_reader, readahead_size, + // and max_readahead_size are passed in. + // If file_reader is a nullptr, setting readahead_size and max_readahead_size + // does not make any sense. So it does nothing. + // A user can construct a FilePrefetchBuffer without any arguments, but then + // must use `Prefetch` to load data into the buffer. + FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false) + : buffer_offset_(0), + file_reader_(file_reader), + readahead_size_(readahead_size), + max_readahead_size_(max_readahead_size), + min_offset_read_(port::kMaxSizet), + enable_(enable), + track_min_offset_(track_min_offset) {} + + // Load data into the buffer from a file. + // reader : the file reader. + // offset : the file offset to start reading from. + // n : the number of bytes to read. + // for_compaction : if prefetch is done for compaction read. + Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n, + bool for_compaction = false); + + // Tries returning the data for a file read from this buffer, if that data is + // in the buffer. + // It handles tracking the minimum read offset if track_min_offset = true. + // It also does the exponential readahead when readahead_size is set as part + // of the constructor. + // + // offset : the file offset. + // n : the number of bytes. + // result : output buffer to put the data into. + // for_compaction : if cache read is done for compaction read. + bool TryReadFromCache(uint64_t offset, size_t n, Slice* result, + bool for_compaction = false); + + // The minimum `offset` ever passed to TryReadFromCache(). This will only be + // tracked if track_min_offset = true. + size_t min_offset_read() const { return min_offset_read_; } + + private: + AlignedBuffer buffer_; + uint64_t buffer_offset_; + RandomAccessFileReader* file_reader_; + size_t readahead_size_; + size_t max_readahead_size_; + // The minimum `offset` ever passed to TryReadFromCache(). + size_t min_offset_read_; + // if false, TryReadFromCache() always returns false, and we only take stats + // for track_min_offset_ if track_min_offset_ = true + bool enable_; + // If true, track minimum `offset` ever passed to TryReadFromCache(), which + // can be fetched from min_offset_read(). + bool track_min_offset_; +}; +} // namespace rocksdb diff --git a/util/file_util.cc b/file/file_util.cc similarity index 79% rename from util/file_util.cc rename to file/file_util.cc index ba1b4744bbb..f1bf6596ba6 100644 --- a/util/file_util.cc +++ b/file/file_util.cc @@ -3,14 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory).
// -#include "util/file_util.h" +#include "file/file_util.h" #include #include +#include "file/random_access_file_reader.h" +#include "file/sequence_file_reader.h" +#include "file/sst_file_manager_impl.h" +#include "file/writable_file_writer.h" #include "rocksdb/env.h" -#include "util/sst_file_manager_impl.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -88,12 +90,12 @@ Status CreateFile(Env* env, const std::string& destination, } Status DeleteDBFile(const ImmutableDBOptions* db_options, - const std::string& fname, const std::string& dir_to_sync, - const bool force_bg) { + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg, const bool force_fg) { #ifndef ROCKSDB_LITE SstFileManagerImpl* sfm = static_cast(db_options->sst_file_manager.get()); - if (sfm) { + if (sfm && !force_fg) { return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); } else { return db_options->env->DeleteFile(fname); @@ -101,10 +103,22 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, #else (void)dir_to_sync; (void)force_bg; + (void)force_fg; // SstFileManager is not supported in ROCKSDB_LITE // Delete file immediately return db_options->env->DeleteFile(fname); #endif } +bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { + bool same = false; + assert(!db_options->db_paths.empty()); + Status s = db_options->env->AreFilesSame(db_options->wal_dir, + db_options->db_paths[0].path, &same); + if (s.IsNotSupported()) { + same = db_options->wal_dir == db_options->db_paths[0].path; + } + return same; +} + } // namespace rocksdb diff --git a/util/file_util.h b/file/file_util.h similarity index 81% rename from util/file_util.h rename to file/file_util.h index c3b365c8bc3..75d6d7eb9fe 100644 --- a/util/file_util.h +++ b/file/file_util.h @@ -6,11 +6,11 @@ #pragma once #include +#include "file/filename.h" #include "options/db_options.h" #include "rocksdb/env.h" #include "rocksdb/status.h" #include "rocksdb/types.h" -#include "util/filename.h" namespace rocksdb { // use_fsync maps to options.use_fsync, which determines the way that @@ -24,7 +24,9 @@ extern Status CreateFile(Env* env, const std::string& destination, extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, - const std::string& path_to_sync, - const bool force_bg = false); + const std::string& path_to_sync, const bool force_bg, + const bool force_fg); + +extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options); } // namespace rocksdb diff --git a/util/filename.cc b/file/filename.cc similarity index 88% rename from util/filename.cc rename to file/filename.cc index 32289aecb4b..5a3fa290226 100644 --- a/util/filename.cc +++ b/file/filename.cc @@ -6,22 +6,18 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include "util/filename.h" -#include +#include "file/filename.h" +#include #include #include #include +#include "file/writable_file_writer.h" +#include "logging/logging.h" #include "rocksdb/env.h" -#include "util/file_reader_writer.h" -#include "util/logging.h" +#include "test_util/sync_point.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/sync_point.h" namespace rocksdb { @@ -61,13 +57,16 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { return write_idx; } +static std::string MakeFileName(uint64_t number, const char* suffix) { + char buf[100]; + snprintf(buf, sizeof(buf), "%06llu.%s", + static_cast(number), suffix); + return buf; +} + static std::string MakeFileName(const std::string& name, uint64_t number, const char* suffix) { - char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", - static_cast(number), - suffix); - return name + buf; + return name + "/" + MakeFileName(number, suffix); } std::string LogFileName(const std::string& name, uint64_t number) { @@ -75,6 +74,11 @@ std::string LogFileName(const std::string& name, uint64_t number) { return MakeFileName(name, number, "log"); } +std::string LogFileName(uint64_t number) { + assert(number > 0); + return MakeFileName(number, "log"); +} + std::string BlobFileName(const std::string& blobdirname, uint64_t number) { assert(number > 0); return MakeFileName(blobdirname, number, kRocksDBBlobFileExt.c_str()); @@ -99,6 +103,10 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) { return MakeFileName(path, number, kRocksDbTFileExt.c_str()); } +std::string MakeTableFileName(uint64_t number) { + return MakeFileName(number, kRocksDbTFileExt.c_str()); +} + std::string Rocks2LevelTableFileName(const std::string& fullname) { assert(fullname.size() > kRocksDbTFileExt.size() + 1); if (fullname.size() <= kRocksDbTFileExt.size() + 1) { @@ -385,8 +393,14 @@ Status SetCurrentFile(Env* env, const std::string& dbname, return s; } -Status SetIdentityFile(Env* env, const std::string& dbname) { - std::string id = env->GenerateUniqueId(); +Status SetIdentityFile(Env* env, const std::string& dbname, + const std::string& db_id) { + std::string id; + if (db_id.empty()) { + id = env->GenerateUniqueId(); + } else { + id = db_id; + } assert(!id.empty()); // Reserve the filename dbname/000000.dbtmp for the temporary identity file std::string tmp = TempFileName(dbname, 0); @@ -407,4 +421,36 @@ Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, return file->Sync(db_options->use_fsync); } +Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, std::string* parent_dir, + std::vector* info_log_list) { + assert(parent_dir != nullptr); + assert(info_log_list != nullptr); + uint64_t number = 0; + FileType type = kLogFile; + + if (!db_log_dir.empty()) { + *parent_dir = db_log_dir; + } else { + *parent_dir = dbname; + } + + InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); + + std::vector file_names; + Status s = env->GetChildren(*parent_dir, &file_names); + + if (!s.ok()) { + return s; + } + + for (auto& f : file_names) { + if (ParseFileName(f, &number, info_log_prefix.prefix, &type) && + (type == kInfoLogFile)) { + info_log_list->push_back(f); + } + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/util/filename.h b/file/filename.h similarity index 91% rename from util/filename.h rename to file/filename.h index eea6b1b02fd..ad19d389594 
100644 --- a/util/filename.h +++ b/file/filename.h @@ -47,6 +47,8 @@ enum FileType { // "dbname". extern std::string LogFileName(const std::string& dbname, uint64_t number); +extern std::string LogFileName(uint64_t number); + extern std::string BlobFileName(const std::string& bdirname, uint64_t number); extern std::string BlobFileName(const std::string& dbname, @@ -63,6 +65,8 @@ extern std::string ArchivedLogFileName(const std::string& dbname, extern std::string MakeTableFileName(const std::string& name, uint64_t number); +extern std::string MakeTableFileName(uint64_t number); + // Return the name of sstable with LevelDB suffix // created from RocksDB sstable suffixed name extern std::string Rocks2LevelTableFileName(const std::string& fullname); @@ -163,10 +167,19 @@ extern Status SetCurrentFile(Env* env, const std::string& dbname, Directory* directory_to_fsync); // Make the IDENTITY file for the db -extern Status SetIdentityFile(Env* env, const std::string& dbname); +extern Status SetIdentityFile(Env* env, const std::string& dbname, + const std::string& db_id = {}); // Sync manifest file `file`. extern Status SyncManifest(Env* env, const ImmutableDBOptions* db_options, WritableFileWriter* file); +// Return list of file names of info logs in `file_names`. +// The list only contains file name. The parent directory name is stored +// in `parent_dir`. +// `db_log_dir` should be the one as in options.db_log_dir +extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, + const std::string& dbname, + std::string* parent_dir, + std::vector* file_names); } // namespace rocksdb diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc new file mode 100644 index 00000000000..5b5a19ff862 --- /dev/null +++ b/file/random_access_file_reader.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "file/random_access_file_reader.h" + +#include +#include + +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, + char* scratch, bool for_compaction) const { + Status s; + uint64_t elapsed = 0; + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = + TruncateToPageBoundary(alignment, static_cast(offset)); + size_t offset_advance = static_cast(offset) - aligned_offset; + size_t read_size = + Roundup(static_cast(offset + n), alignment) - aligned_offset; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(read_size); + while (buf.CurrentSize() < read_size) { + size_t allowed; + if (for_compaction && rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + buf.Capacity() - buf.CurrentSize(), buf.Alignment(), + Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead); + } else { + assert(buf.CurrentSize() == 0); + allowed = read_size; + } + Slice tmp; + + FileOperationInfo::TimePoint start_ts; + uint64_t orig_offset = 0; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + orig_offset = aligned_offset + buf.CurrentSize(); + } + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp, + buf.Destination()); + } + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + s); + } + + buf.Size(buf.CurrentSize() + tmp.size()); + if (!s.ok() || tmp.size() < allowed) { + break; + } + } + size_t res_len = 0; + if (s.ok() && offset_advance < buf.CurrentSize()) { + res_len = buf.Read(scratch, offset_advance, + std::min(buf.CurrentSize() - offset_advance, n)); + } + *result = Slice(scratch, res_len); +#endif // !ROCKSDB_LITE + } else { + size_t pos = 0; + const char* res_scratch = nullptr; + while (pos < n) { + size_t allowed; + if (for_compaction && rate_limiter_ != nullptr) { + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStart(); + } + allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */, + Env::IOPriority::IO_LOW, stats_, + RateLimiter::OpType::kRead); + if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { + sw.DelayStop(); + } + } else { + allowed = n; + } + Slice tmp_result; + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, + finish_ts, s); + } +#endif + + if (res_scratch == nullptr) { + // we can't simply use `scratch` because reads of mmap'd files return + // data in a different buffer. + res_scratch = tmp_result.data(); + } else { + // make sure chunks are inserted contiguously into `res_scratch`. + assert(tmp_result.data() == res_scratch + pos); + } + pos += tmp_result.size(); + if (!s.ok() || tmp_result.size() < allowed) { + break; + } + } + *result = Slice(res_scratch, s.ok() ? 
pos : 0); + } + IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + SetPerfLevel(prev_perf_level); + } + if (stats_ != nullptr && file_read_hist_ != nullptr) { + file_read_hist_->Add(elapsed); + } + + return s; +} + +Status RandomAccessFileReader::MultiRead(ReadRequest* read_reqs, + size_t num_reqs) const { + Status s; + uint64_t elapsed = 0; + assert(!use_direct_io()); + { + StopWatch sw(env_, stats_, hist_type_, + (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_TIMER_GUARD(read_nanos); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } +#endif // ROCKSDB_LITE + { + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + s = file_->MultiRead(read_reqs, num_reqs); + } + for (size_t i = 0; i < num_reqs; ++i) { +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(), + start_ts, finish_ts, read_reqs[i].status); + } +#endif // ROCKSDB_LITE + IOSTATS_ADD_IF_POSITIVE(bytes_read, read_reqs[i].result.size()); + } + SetPerfLevel(prev_perf_level); + } + if (stats_ != nullptr && file_read_hist_ != nullptr) { + file_read_hist_->Add(elapsed); + } + + return s; +} +} // namespace rocksdb diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h new file mode 100644 index 00000000000..abbc71ff112 --- /dev/null +++ b/file/random_access_file_reader.h @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" +#include "util/aligned_buffer.h" + +namespace rocksdb { + +class Statistics; +class HistogramImpl; + +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is +// responsible for: +// - Handling Buffered and Direct reads appropriately. +// - Rate limiting compaction reads. +// - Notifying any interested listeners on the completion of a read. +// - Updating IO stats.
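+// A minimal usage sketch (illustrative only; `fname`, `env_options` and the
+// local variables are hypothetical) based on the constructor and Read()
+// declared below:
+//
+//   std::unique_ptr<RandomAccessFile> raf;
+//   env->NewRandomAccessFile(fname, &raf, env_options);
+//   RandomAccessFileReader reader(std::move(raf), fname, env);
+//   char scratch[4096];
+//   Slice result;
+//   Status s = reader.Read(/*offset=*/0, sizeof(scratch), &result, scratch);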
+class RandomAccessFileReader { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileReadFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) const { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileReadFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr file_; + std::string file_name_; + Env* env_; + Statistics* stats_; + uint32_t hist_type_; + HistogramImpl* file_read_hist_; + RateLimiter* rate_limiter_; + std::vector> listeners_; + + public: + explicit RandomAccessFileReader( + std::unique_ptr&& raf, std::string _file_name, + Env* env = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, + HistogramImpl* file_read_hist = nullptr, + RateLimiter* rate_limiter = nullptr, + const std::vector>& listeners = {}) + : file_(std::move(raf)), + file_name_(std::move(_file_name)), + env_(env), + stats_(stats), + hist_type_(hist_type), + file_read_hist_(file_read_hist), + rate_limiter_(rate_limiter), + listeners_() { +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + RandomAccessFileReader& operator=(RandomAccessFileReader&& o) + ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + env_ = std::move(o.env_); + stats_ = std::move(o.stats_); + hist_type_ = std::move(o.hist_type_); + file_read_hist_ = std::move(o.file_read_hist_); + rate_limiter_ = std::move(o.rate_limiter_); + return *this; + } + + RandomAccessFileReader(const RandomAccessFileReader&) = delete; + RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, + bool for_compaction = false) const; + + Status MultiRead(ReadRequest* reqs, size_t num_reqs) const; + + Status Prefetch(uint64_t offset, size_t n) const { + return file_->Prefetch(offset, n); + } + + RandomAccessFile* file() { return file_.get(); } + + std::string file_name() const { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } +}; +} // namespace rocksdb diff --git a/file/read_write_util.cc b/file/read_write_util.cc new file mode 100644 index 00000000000..892499b8cfa --- /dev/null +++ b/file/read_write_util.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "file/read_write_util.h" + +#include +#include "test_util/sync_point.h" + +namespace rocksdb { +Status NewWritableFile(Env* env, const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status s = env->NewWritableFile(fname, result, options); + TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + return s; +} + +bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, + std::string* output, bool* has_data, Status* result) { + const int kBufferSize = 8192; + char buffer[kBufferSize + 1]; + Slice input_slice; + + std::string line; + bool has_complete_line = false; + while (!has_complete_line) { + if (std::getline(*iss, line)) { + has_complete_line = !iss->eof(); + } else { + has_complete_line = false; + } + if (!has_complete_line) { + // if we're not sure whether we have a complete line, + // further read from the file. + if (*has_data) { + *result = seq_file->Read(kBufferSize, &input_slice, buffer); + } + if (input_slice.size() == 0) { + // meaning we have read all the data + *has_data = false; + break; + } else { + iss->str(line + input_slice.ToString()); + // reset the internal state of iss so that we can keep reading it. + iss->clear(); + *has_data = (input_slice.size() == kBufferSize); + continue; + } + } + } + *output = line; + return *has_data || has_complete_line; +} + +#ifndef NDEBUG +bool IsFileSectorAligned(const size_t off, size_t sector_size) { + return off % sector_size == 0; +} +#endif // NDEBUG +} // namespace rocksdb diff --git a/file/read_write_util.h b/file/read_write_util.h new file mode 100644 index 00000000000..be975e854ff --- /dev/null +++ b/file/read_write_util.h @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/env.h" + +namespace rocksdb { +// Returns a WritableFile. +// +// env : the Env. +// fname : the file name. +// result : output arg. A WritableFile based on `fname` returned. +// options : the Env Options. +extern Status NewWritableFile(Env* env, const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options); + +// Read a single line from a file. +bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, + std::string* output, bool* has_data, Status* result); + +#ifndef NDEBUG +bool IsFileSectorAligned(const size_t off, size_t sector_size); +#endif // NDEBUG +} // namespace rocksdb diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc new file mode 100644 index 00000000000..dc005b900b6 --- /dev/null +++ b/file/readahead_raf.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "file/readahead_raf.h" + +#include +#include +#include "file/read_write_util.h" +#include "util/aligned_buffer.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +namespace { +class ReadaheadRandomAccessFile : public RandomAccessFile { + public: + ReadaheadRandomAccessFile(std::unique_ptr&& file, + size_t readahead_size) + : file_(std::move(file)), + alignment_(file_->GetRequiredBufferAlignment()), + readahead_size_(Roundup(readahead_size, alignment_)), + buffer_(), + buffer_offset_(0) { + buffer_.Alignment(alignment_); + buffer_.AllocateNewBuffer(readahead_size_); + } + + ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete; + + ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = + delete; + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + // Read-ahead only make sense if we have some slack left after reading + if (n + alignment_ >= readahead_size_) { + return file_->Read(offset, n, result, scratch); + } + + std::unique_lock lk(lock_); + + size_t cached_len = 0; + // Check if there is a cache hit, meaning that [offset, offset + n) is + // either completely or partially in the buffer. If it's completely cached, + // including end of file case when offset + n is greater than EOF, then + // return. + if (TryReadFromCache(offset, n, &cached_len, scratch) && + (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { + // We read exactly what we needed, or we hit end of file - return. + *result = Slice(scratch, cached_len); + return Status::OK(); + } + size_t advanced_offset = static_cast(offset + cached_len); + // In the case of cache hit advanced_offset is already aligned, means that + // chunk_offset equals to advanced_offset + size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); + + Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + if (s.ok()) { + // The data we need is now in cache, so we can safely read it + size_t remaining_len; + TryReadFromCache(advanced_offset, n - cached_len, &remaining_len, + scratch + cached_len); + *result = Slice(scratch, cached_len + remaining_len); + } + return s; + } + + Status Prefetch(uint64_t offset, size_t n) override { + if (n < readahead_size_) { + // Don't allow smaller prefetches than the configured `readahead_size_`. + // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. + return Status::OK(); + } + + std::unique_lock lk(lock_); + + size_t offset_ = static_cast(offset); + size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); + if (prefetch_offset == buffer_offset_) { + return Status::OK(); + } + return ReadIntoBuffer(prefetch_offset, + Roundup(offset_ + n, alignment_) - prefetch_offset); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return file_->GetUniqueId(id, max_size); + } + + void Hint(AccessPattern pattern) override { file_->Hint(pattern); } + + Status InvalidateCache(size_t offset, size_t length) override { + std::unique_lock lk(lock_); + buffer_.Clear(); + return file_->InvalidateCache(offset, length); + } + + bool use_direct_io() const override { return file_->use_direct_io(); } + + private: + // Tries to read from buffer_ n bytes starting at offset. If anything was read + // from the cache, it sets cached_len to the number of bytes actually read, + // copies these number of bytes to scratch and returns true. + // If nothing was read sets cached_len to 0 and returns false. 
+ bool TryReadFromCache(uint64_t offset, size_t n, size_t* cached_len, + char* scratch) const { + if (offset < buffer_offset_ || + offset >= buffer_offset_ + buffer_.CurrentSize()) { + *cached_len = 0; + return false; + } + uint64_t offset_in_buffer = offset - buffer_offset_; + *cached_len = std::min( + buffer_.CurrentSize() - static_cast(offset_in_buffer), n); + memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len); + return true; + } + + // Reads into buffer_ the next n bytes from file_ starting at offset. + // Can actually read less if EOF was reached. + // Returns the status of the read operastion on the file. + Status ReadIntoBuffer(uint64_t offset, size_t n) const { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + assert(IsFileSectorAligned(offset, alignment_)); + assert(IsFileSectorAligned(n, alignment_)); + Slice result; + Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = offset; + buffer_.Size(result.size()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); + } + return s; + } + + const std::unique_ptr file_; + const size_t alignment_; + const size_t readahead_size_; + + mutable std::mutex lock_; + // The buffer storing the prefetched data + mutable AlignedBuffer buffer_; + // The offset in file_, corresponding to data stored in buffer_ + mutable uint64_t buffer_offset_; +}; +} // namespace + +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( + new ReadaheadRandomAccessFile(std::move(file), readahead_size)); + return result; +} +} // namespace rocksdb diff --git a/file/readahead_raf.h b/file/readahead_raf.h new file mode 100644 index 00000000000..f6d64e77ac5 --- /dev/null +++ b/file/readahead_raf.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include "rocksdb/env.h" + +namespace rocksdb { +// This file provides the following main abstractions: +// SequentialFileReader : wrapper over Env::SequentialFile +// RandomAccessFileReader : wrapper over Env::RandomAccessFile +// WritableFileWriter : wrapper over Env::WritableFile +// In addition, it also exposed NewReadaheadRandomAccessFile, NewWritableFile, +// and ReadOneLine primitives. + +// NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to +// always prefetch additional data with every read. This is mainly used in +// Compaction Table Readers. +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size); +} // namespace rocksdb diff --git a/file/sequence_file_reader.cc b/file/sequence_file_reader.cc new file mode 100644 index 00000000000..be766a68b38 --- /dev/null +++ b/file/sequence_file_reader.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "file/sequence_file_reader.h" + +#include +#include + +#include "file/read_write_util.h" +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + Status s; + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + size_t offset = offset_.fetch_add(n); + size_t alignment = file_->GetRequiredBufferAlignment(); + size_t aligned_offset = TruncateToPageBoundary(alignment, offset); + size_t offset_advance = offset - aligned_offset; + size_t size = Roundup(offset + n, alignment) - aligned_offset; + size_t r = 0; + AlignedBuffer buf; + buf.Alignment(alignment); + buf.AllocateNewBuffer(size); + Slice tmp; + s = file_->PositionedRead(aligned_offset, size, &tmp, buf.BufferStart()); + if (s.ok() && offset_advance < tmp.size()) { + buf.Size(tmp.size()); + r = buf.Read(scratch, offset_advance, + std::min(tmp.size() - offset_advance, n)); + } + *result = Slice(scratch, r); +#endif // !ROCKSDB_LITE + } else { + s = file_->Read(n, result, scratch); + } + IOSTATS_ADD(bytes_read, result->size()); + return s; +} + +Status SequentialFileReader::Skip(uint64_t n) { +#ifndef ROCKSDB_LITE + if (use_direct_io()) { + offset_ += static_cast(n); + return Status::OK(); + } +#endif // !ROCKSDB_LITE + return file_->Skip(n); +} + +namespace { +// This class wraps a SequentialFile, exposing the same API, with the difference +// of being able to prefetch up to readahead_size bytes and then serve them +// from memory, avoiding the entire round-trip if, for example, the data for the +// file is actually remote. +class ReadaheadSequentialFile : public SequentialFile { + public: + ReadaheadSequentialFile(std::unique_ptr&& file, + size_t readahead_size) + : file_(std::move(file)), + alignment_(file_->GetRequiredBufferAlignment()), + readahead_size_(Roundup(readahead_size, alignment_)), + buffer_(), + buffer_offset_(0), + read_offset_(0) { + buffer_.Alignment(alignment_); + buffer_.AllocateNewBuffer(readahead_size_); + } + + ReadaheadSequentialFile(const ReadaheadSequentialFile&) = delete; + + ReadaheadSequentialFile& operator=(const ReadaheadSequentialFile&) = delete; + + Status Read(size_t n, Slice* result, char* scratch) override { + std::unique_lock lk(lock_); + + size_t cached_len = 0; + // Check if there is a cache hit, meaning that [offset, offset + n) is + // either completely or partially in the buffer. If it's completely cached, + // including the end-of-file case when offset + n is greater than EOF, then + // return. + if (TryReadFromCache(n, &cached_len, scratch) && + (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { + // We read exactly what we needed, or we hit end of file - return.
+ *result = Slice(scratch, cached_len); + return Status::OK(); + } + n -= cached_len; + + Status s; + // Read-ahead only makes sense if we have some slack left after reading + if (n + alignment_ >= readahead_size_) { + s = file_->Read(n, result, scratch + cached_len); + if (s.ok()) { + read_offset_ += result->size(); + *result = Slice(scratch, cached_len + result->size()); + } + buffer_.Clear(); + return s; + } + + s = ReadIntoBuffer(readahead_size_); + if (s.ok()) { + // The data we need is now in cache, so we can safely read it + size_t remaining_len; + TryReadFromCache(n, &remaining_len, scratch + cached_len); + *result = Slice(scratch, cached_len + remaining_len); + } + return s; + } + + Status Skip(uint64_t n) override { + std::unique_lock lk(lock_); + Status s = Status::OK(); + // First check if we need to skip already cached data + if (buffer_.CurrentSize() > 0) { + // Do we need to skip beyond cached data? + if (read_offset_ + n >= buffer_offset_ + buffer_.CurrentSize()) { + // Yes. Skip whatever is in memory and adjust the offset accordingly + n -= buffer_offset_ + buffer_.CurrentSize() - read_offset_; + read_offset_ = buffer_offset_ + buffer_.CurrentSize(); + } else { + // No. The section to be skipped is entirely in cache. + read_offset_ += n; + n = 0; + } + } + if (n > 0) { + // We still need to skip more, so call the file API for skipping + s = file_->Skip(n); + if (s.ok()) { + read_offset_ += n; + } + buffer_.Clear(); + } + return s; + } + + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + return file_->PositionedRead(offset, n, result, scratch); + } + + Status InvalidateCache(size_t offset, size_t length) override { + std::unique_lock lk(lock_); + buffer_.Clear(); + return file_->InvalidateCache(offset, length); + } + + bool use_direct_io() const override { return file_->use_direct_io(); } + + private: + // Tries to read n bytes from buffer_. If anything was read from the cache, it + // sets cached_len to the number of bytes actually read, copies that many + // bytes to scratch and returns true. + // If nothing was read, sets cached_len to 0 and returns false. + bool TryReadFromCache(size_t n, size_t* cached_len, char* scratch) { + if (read_offset_ < buffer_offset_ || + read_offset_ >= buffer_offset_ + buffer_.CurrentSize()) { + *cached_len = 0; + return false; + } + uint64_t offset_in_buffer = read_offset_ - buffer_offset_; + *cached_len = std::min( + buffer_.CurrentSize() - static_cast(offset_in_buffer), n); + memcpy(scratch, buffer_.BufferStart() + offset_in_buffer, *cached_len); + read_offset_ += *cached_len; + return true; + } + + // Reads into buffer_ the next n bytes from file_. + // Can actually read less if EOF was reached. + // Returns the status of the read operation on the file. + Status ReadIntoBuffer(size_t n) { + if (n > buffer_.Capacity()) { + n = buffer_.Capacity(); + } + assert(IsFileSectorAligned(n, alignment_)); + Slice result; + Status s = file_->Read(n, &result, buffer_.BufferStart()); + if (s.ok()) { + buffer_offset_ = read_offset_; + buffer_.Size(result.size()); + assert(result.size() == 0 || buffer_.BufferStart() == result.data()); + } + return s; + } + + const std::unique_ptr file_; + const size_t alignment_; + const size_t readahead_size_; + + std::mutex lock_; + // The buffer storing the prefetched data + AlignedBuffer buffer_; + // The offset in file_, corresponding to data stored in buffer_ + uint64_t buffer_offset_; + // The offset up to which data was read from file_.
In fact, it can be larger + // than the actual file size, since the file_->Skip(n) call doesn't return the + // actual number of bytes that were skipped, which can be less than n. + // This is not a problem since read_offset_ is monotonically increasing and + // its only use is to figure out if the next piece of data should be read from + // buffer_ or file_ directly. + uint64_t read_offset_; +}; +} // namespace + +std::unique_ptr +SequentialFileReader::NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size) { + if (file->GetRequiredBufferAlignment() >= readahead_size) { + // Short-circuit and return the original file if readahead_size is + // too small and hence doesn't make sense to be used for prefetching. + return std::move(file); + } + std::unique_ptr result( + new ReadaheadSequentialFile(std::move(file), readahead_size)); + return result; +} +} // namespace rocksdb diff --git a/file/sequence_file_reader.h b/file/sequence_file_reader.h new file mode 100644 index 00000000000..6a6350e1d69 --- /dev/null +++ b/file/sequence_file_reader.h @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "port/port.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +// SequentialFileReader is a wrapper on top of Env::SequentialFile. It handles +// Buffered (i.e. when page cache is enabled) and Direct (with O_DIRECT / page +// cache disabled) reads appropriately, and also updates the IO stats. +class SequentialFileReader { + private: + std::unique_ptr file_; + std::string file_name_; + std::atomic offset_{0}; // read offset + + public: + explicit SequentialFileReader(std::unique_ptr&& _file, + const std::string& _file_name) + : file_(std::move(_file)), file_name_(_file_name) {} + + explicit SequentialFileReader(std::unique_ptr&& _file, + const std::string& _file_name, + size_t _readahead_size) + : file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size)), + file_name_(_file_name) {} + + SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } + + SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT { + file_ = std::move(o.file_); + return *this; + } + + SequentialFileReader(const SequentialFileReader&) = delete; + SequentialFileReader& operator=(const SequentialFileReader&) = delete; + + Status Read(size_t n, Slice* result, char* scratch); + + Status Skip(uint64_t n); + + SequentialFile* file() { return file_.get(); } + + std::string file_name() { return file_name_; } + + bool use_direct_io() const { return file_->use_direct_io(); } + + private: + // NewReadaheadSequentialFile provides a wrapper over SequentialFile to + // always prefetch additional data with every read.
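+  // If readahead_size is not larger than the file's required buffer
+  // alignment, the original file is returned unchanged (see the
+  // short-circuit in sequence_file_reader.cc).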
+ static std::unique_ptr NewReadaheadSequentialFile( + std::unique_ptr&& file, size_t readahead_size); +}; +} // namespace rocksdb diff --git a/util/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc similarity index 98% rename from util/sst_file_manager_impl.cc rename to file/sst_file_manager_impl.cc index 6a770b106e8..08ea873258a 100644 --- a/util/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -3,21 +3,17 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sst_file_manager_impl.h" +#include "file/sst_file_manager_impl.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sync_point.h" namespace rocksdb { @@ -264,8 +260,11 @@ void SstFileManagerImpl::ClearError() { return; } - uint64_t free_space; + uint64_t free_space = 0; Status s = env_->GetFreeSpace(path_, &free_space); + free_space = max_allowed_space_ > 0 + ? std::min(max_allowed_space_, free_space) + : free_space; if (s.ok()) { // In case of multi-DB instances, some of them may have experienced a // soft error and some a hard error. In the SstFileManagerImpl, a hard diff --git a/util/sst_file_manager_impl.h b/file/sst_file_manager_impl.h similarity index 99% rename from util/sst_file_manager_impl.h rename to file/sst_file_manager_impl.h index 211b4fa7160..89304227807 100644 --- a/util/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -11,10 +11,10 @@ #include "port/port.h" -#include "db/compaction.h" +#include "db/compaction/compaction.h" #include "db/error_handler.h" +#include "file/delete_scheduler.h" #include "rocksdb/sst_file_manager.h" -#include "util/delete_scheduler.h" namespace rocksdb { diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc new file mode 100644 index 00000000000..277e55500c1 --- /dev/null +++ b/file/writable_file_writer.cc @@ -0,0 +1,405 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
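The writable_file_writer.cc added below (its header appears later in this diff) carries the buffered and direct write paths. As a hedged sketch of how the class is typically driven, with a hypothetical path and payload:

#include <memory>
#include "file/writable_file_writer.h"
#include "rocksdb/env.h"

// Appends a payload through WritableFileWriter's aligned buffer, then syncs
// and closes. Under direct I/O the same calls go through WriteDirect().
rocksdb::Status WriteThroughWriter(rocksdb::Env* env) {
  std::unique_ptr<rocksdb::WritableFile> file;
  rocksdb::EnvOptions env_options;
  rocksdb::Status s = env->NewWritableFile(
      "/tmp/example.dat" /* hypothetical */, &file, env_options);
  if (!s.ok()) {
    return s;
  }
  rocksdb::WritableFileWriter writer(std::move(file), "/tmp/example.dat",
                                     env_options, env);
  s = writer.Append(rocksdb::Slice("some payload"));
  if (s.ok()) {
    s = writer.Sync(false /* use_fsync */);
  }
  if (s.ok()) {
    s = writer.Close();
  }
  return s;
}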
+ +#include "file/writable_file_writer.h" + +#include +#include + +#include "monitoring/histogram.h" +#include "monitoring/iostats_context_imp.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/random.h" +#include "util/rate_limiter.h" + +namespace rocksdb { +Status WritableFileWriter::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + + TEST_KILL_RANDOM("WritableFileWriter::Append:0", + rocksdb_kill_odds * REDUCE_ODDS2); + + { + IOSTATS_TIMER_GUARD(prepare_write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite"); + writable_file_->PrepareWrite(static_cast(GetFileSize()), left); + } + + // See whether we need to enlarge the buffer to avoid the flush + if (buf_.Capacity() - buf_.CurrentSize() < left) { + for (size_t cap = buf_.Capacity(); + cap < max_buffer_size_; // There is still room to increase + cap *= 2) { + // See whether the next available size is large enough. + // Buffer will never be increased to more than max_buffer_size_. + size_t desired_capacity = std::min(cap * 2, max_buffer_size_); + if (desired_capacity - buf_.CurrentSize() >= left || + (use_direct_io() && desired_capacity == max_buffer_size_)) { + buf_.AllocateNewBuffer(desired_capacity, true); + break; + } + } + } + + // Flush only when buffered I/O + if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) { + if (buf_.CurrentSize() > 0) { + s = Flush(); + if (!s.ok()) { + return s; + } + } + assert(buf_.CurrentSize() == 0); + } + + // We never write directly to disk with direct I/O on. + // or we simply use it for its original purpose to accumulate many small + // chunks + if (use_direct_io() || (buf_.Capacity() >= left)) { + while (left > 0) { + size_t appended = buf_.Append(src, left); + left -= appended; + src += appended; + + if (left > 0) { + s = Flush(); + if (!s.ok()) { + break; + } + } + } + } else { + // Writing directly to file bypassing the buffer + assert(buf_.CurrentSize() == 0); + s = WriteBuffered(src, left); + } + + TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds); + if (s.ok()) { + filesize_ += data.size(); + } + return s; +} + +Status WritableFileWriter::Pad(const size_t pad_bytes) { + assert(pad_bytes < kDefaultPageSize); + size_t left = pad_bytes; + size_t cap = buf_.Capacity() - buf_.CurrentSize(); + + // Assume pad_bytes is small compared to buf_ capacity. So we always + // use buf_ rather than write directly to file in certain cases like + // Append() does. + while (left) { + size_t append_bytes = std::min(cap, left); + buf_.PadWith(append_bytes, 0); + left -= append_bytes; + if (left > 0) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + } + cap = buf_.Capacity() - buf_.CurrentSize(); + } + pending_sync_ = true; + filesize_ += pad_bytes; + return Status::OK(); +} + +Status WritableFileWriter::Close() { + // Do not quit immediately on failure the file MUST be closed + Status s; + + // Possible to close it twice now as we MUST close + // in __dtor, simply flushing is not enough + // Windows when pre-allocating does not fill with zeros + // also with unbuffered access we also set the end of data. + if (!writable_file_) { + return s; + } + + s = Flush(); // flush cache to OS + + Status interim; + // In direct I/O mode we write whole pages so + // we need to let the file know where data ends. 
+ if (use_direct_io()) { + interim = writable_file_->Truncate(filesize_); + if (interim.ok()) { + interim = writable_file_->Fsync(); + } + if (!interim.ok() && s.ok()) { + s = interim; + } + } + + TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); + interim = writable_file_->Close(); + if (!interim.ok() && s.ok()) { + s = interim; + } + + writable_file_.reset(); + TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + + return s; +} + +// write out the cached data to the OS cache or storage if direct I/O +// enabled +Status WritableFileWriter::Flush() { + Status s; + TEST_KILL_RANDOM("WritableFileWriter::Flush:0", + rocksdb_kill_odds * REDUCE_ODDS2); + + if (buf_.CurrentSize() > 0) { + if (use_direct_io()) { +#ifndef ROCKSDB_LITE + if (pending_sync_) { + s = WriteDirect(); + } +#endif // !ROCKSDB_LITE + } else { + s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize()); + } + if (!s.ok()) { + return s; + } + } + + s = writable_file_->Flush(); + + if (!s.ok()) { + return s; + } + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). + + // We try to avoid sync to the last 1MB of data. For two reasons: + // (1) avoid rewrite the same page that is modified later. + // (2) for older version of OS, write can block while writing out + // the page. + // Xfs does neighbor page flushing outside of the specified ranges. We + // need to make sure sync range is far from the write offset. + if (!use_direct_io() && bytes_per_sync_) { + const uint64_t kBytesNotSyncRange = + 1024 * 1024; // recent 1MB is not synced. + const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB. 
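// Illustrative walk-through of the range-sync logic below, with hypothetical
// numbers: if bytes_per_sync_ is 1MB, last_sync_size_ is 4,194,304 and
// filesize_ is 10,400,000, then offset_sync_to starts at
// 10,400,000 - 1,048,576 = 9,351,424, is rounded down to the 4KB boundary
// 9,351,168, and since 9,351,168 - 4,194,304 >= 1MB the writer calls
// RangeSync(4,194,304, 5,156,864) and records last_sync_size_ = 9,351,168.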
+ if (filesize_ > kBytesNotSyncRange) { + uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange; + offset_sync_to -= offset_sync_to % kBytesAlignWhenSync; + assert(offset_sync_to >= last_sync_size_); + if (offset_sync_to > 0 && + offset_sync_to - last_sync_size_ >= bytes_per_sync_) { + s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + last_sync_size_ = offset_sync_to; + } + } + } + + return s; +} + +Status WritableFileWriter::Sync(bool use_fsync) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + if (!use_direct_io() && pending_sync_) { + s = SyncInternal(use_fsync); + if (!s.ok()) { + return s; + } + } + TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + pending_sync_ = false; + return Status::OK(); +} + +Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) { + if (!writable_file_->IsSyncThreadSafe()) { + return Status::NotSupported( + "Can't WritableFileWriter::SyncWithoutFlush() because " + "WritableFile::IsSyncThreadSafe() is false"); + } + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1"); + Status s = SyncInternal(use_fsync); + TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2"); + return s; +} + +Status WritableFileWriter::SyncInternal(bool use_fsync) { + Status s; + IOSTATS_TIMER_GUARD(fsync_nanos); + TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + if (use_fsync) { + s = writable_file_->Fsync(); + } else { + s = writable_file_->Sync(); + } + SetPerfLevel(prev_perf_level); + return s; +} + +Status WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { + IOSTATS_TIMER_GUARD(range_sync_nanos); + TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); + return writable_file_->RangeSync(offset, nbytes); +} + +// This method writes to disk the specified data and makes use of the rate +// limiter if available +Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { + Status s; + assert(!use_direct_io()); + const char* src = data; + size_t left = size; + + while (left > 0) { + size_t allowed; + if (rate_limiter_ != nullptr) { + allowed = rate_limiter_->RequestToken( + left, 0 /* alignment */, writable_file_->GetIOPriority(), stats_, + RateLimiter::OpType::kWrite); + } else { + allowed = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + +#ifndef ROCKSDB_LITE + FileOperationInfo::TimePoint start_ts; + uint64_t old_size = writable_file_->GetFileSize(); + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + old_size = next_write_offset_; + } +#endif + { + auto prev_perf_level = GetPerfLevel(); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + s = writable_file_->Append(Slice(src, allowed)); + SetPerfLevel(prev_perf_level); + } +#ifndef ROCKSDB_LITE + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); + } +#endif + if (!s.ok()) { + return s; + } + } + + IOSTATS_ADD(bytes_written, allowed); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + + left -= allowed; + src += allowed; + } + buf_.Size(0); + return s; +} + +// This flushes the accumulated data in the buffer. We pad data with zeros if +// necessary to the whole page. +// However, during automatic flushes padding would not be necessary. 
+// We always use RateLimiter if available. We move (Refit) any buffer bytes +// that are left over the +// whole number of pages to be written again on the next flush because we can +// only write on aligned +// offsets. +#ifndef ROCKSDB_LITE +Status WritableFileWriter::WriteDirect() { + assert(use_direct_io()); + Status s; + const size_t alignment = buf_.Alignment(); + assert((next_write_offset_ % alignment) == 0); + + // Calculate whole page final file advance if all writes succeed + size_t file_advance = TruncateToPageBoundary(alignment, buf_.CurrentSize()); + + // Calculate the leftover tail, we write it here padded with zeros BUT we + // will write + // it again in the future either on Close() OR when the current whole page + // fills out + size_t leftover_tail = buf_.CurrentSize() - file_advance; + + // Round up and pad + buf_.PadToAlignmentWith(0); + + const char* src = buf_.BufferStart(); + uint64_t write_offset = next_write_offset_; + size_t left = buf_.CurrentSize(); + + while (left > 0) { + // Check how much is allowed + size_t size; + if (rate_limiter_ != nullptr) { + size = rate_limiter_->RequestToken(left, buf_.Alignment(), + writable_file_->GetIOPriority(), + stats_, RateLimiter::OpType::kWrite); + } else { + size = left; + } + + { + IOSTATS_TIMER_GUARD(write_nanos); + TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); + FileOperationInfo::TimePoint start_ts; + if (ShouldNotifyListeners()) { + start_ts = std::chrono::system_clock::now(); + } + // direct writes must be positional + s = writable_file_->PositionedAppend(Slice(src, size), write_offset); + if (ShouldNotifyListeners()) { + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); + } + if (!s.ok()) { + buf_.Size(file_advance + leftover_tail); + return s; + } + } + + IOSTATS_ADD(bytes_written, size); + left -= size; + src += size; + write_offset += size; + assert((next_write_offset_ % alignment) == 0); + } + + if (s.ok()) { + // Move the tail to the beginning of the buffer + // This never happens during normal Append but rather during + // explicit call to Flush()/Sync() or Close() + buf_.RefitTail(file_advance, leftover_tail); + // This is where we start writing next time which may or not be + // the actual file size on disk. They match if the buffer size + // is a multiple of whole pages otherwise filesize_ is leftover_tail + // behind + next_write_offset_ += file_advance; + } + return s; +} +#endif // !ROCKSDB_LITE +} // namespace rocksdb diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h new file mode 100644 index 00000000000..09d612233af --- /dev/null +++ b/file/writable_file_writer.h @@ -0,0 +1,155 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/rate_limiter.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" + +namespace rocksdb { + +class Statistics; + +// WritableFileWriter is a wrapper on top of Env::WritableFile. 
It provides +// facilities to: +// - Handle Buffered and Direct writes. +// - Rate limit writes. +// - Flush and Sync the data to the underlying filesystem. +// - Notify any interested listeners on the completion of a write. +// - Update IO stats. +class WritableFileWriter { + private: +#ifndef ROCKSDB_LITE + void NotifyOnFileWriteFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, + const Status& status) { + FileOperationInfo info(file_name_, start_ts, finish_ts); + info.offset = offset; + info.length = length; + info.status = status; + + for (auto& listener : listeners_) { + listener->OnFileWriteFinish(info); + } + } +#endif // ROCKSDB_LITE + + bool ShouldNotifyListeners() const { return !listeners_.empty(); } + + std::unique_ptr writable_file_; + std::string file_name_; + Env* env_; + AlignedBuffer buf_; + size_t max_buffer_size_; + // Actually written data size can be used for truncate + // not counting padding data + uint64_t filesize_; +#ifndef ROCKSDB_LITE + // This is necessary when we use unbuffered access + // and writes must happen on aligned offsets + // so we need to go back and write that page again + uint64_t next_write_offset_; +#endif // ROCKSDB_LITE + bool pending_sync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; + RateLimiter* rate_limiter_; + Statistics* stats_; + std::vector> listeners_; + + public: + WritableFileWriter( + std::unique_ptr&& file, const std::string& _file_name, + const EnvOptions& options, Env* env = nullptr, + Statistics* stats = nullptr, + const std::vector>& listeners = {}) + : writable_file_(std::move(file)), + file_name_(_file_name), + env_(env), + buf_(), + max_buffer_size_(options.writable_file_max_buffer_size), + filesize_(0), +#ifndef ROCKSDB_LITE + next_write_offset_(0), +#endif // ROCKSDB_LITE + pending_sync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + rate_limiter_(options.rate_limiter), + stats_(stats), + listeners_() { + TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", + reinterpret_cast(max_buffer_size_)); + buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); + buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_)); +#ifndef ROCKSDB_LITE + std::for_each(listeners.begin(), listeners.end(), + [this](const std::shared_ptr& e) { + if (e->ShouldBeNotifiedOnFileIO()) { + listeners_.emplace_back(e); + } + }); +#else // !ROCKSDB_LITE + (void)listeners; +#endif + } + + WritableFileWriter(const WritableFileWriter&) = delete; + + WritableFileWriter& operator=(const WritableFileWriter&) = delete; + + ~WritableFileWriter() { Close(); } + + std::string file_name() const { return file_name_; } + + Status Append(const Slice& data); + + Status Pad(const size_t pad_bytes); + + Status Flush(); + + Status Close(); + + Status Sync(bool use_fsync); + + // Sync only the data that was already Flush()ed. Safe to call concurrently + // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(), + // returns NotSupported status. 
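  // For example (illustrative, not from this change): a background thread can
  // periodically call SyncWithoutFlush(false) to bound the amount of unsynced
  // data while a foreground thread keeps calling Append() and Flush(); that is
  // the concurrent usage the thread-safety note above refers to.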
+ Status SyncWithoutFlush(bool use_fsync); + + uint64_t GetFileSize() const { return filesize_; } + + Status InvalidateCache(size_t offset, size_t length) { + return writable_file_->InvalidateCache(offset, length); + } + + WritableFile* writable_file() const { return writable_file_.get(); } + + bool use_direct_io() { return writable_file_->use_direct_io(); } + + bool TEST_BufferIsEmpty() { return buf_.CurrentSize() == 0; } + + private: + // Used when os buffering is OFF and we are writing + // DMA such as in Direct I/O mode +#ifndef ROCKSDB_LITE + Status WriteDirect(); +#endif // !ROCKSDB_LITE + // Normal write + Status WriteBuffered(const char* data, size_t size); + Status RangeSync(uint64_t offset, uint64_t nbytes); + Status SyncInternal(bool use_fsync); +}; +} // namespace rocksdb diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index c88a6c17df2..d4e986a110a 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -175,11 +175,26 @@ struct AdvancedColumnFamilyOptions { // individual write buffers. Default: 1 int min_write_buffer_number_to_merge = 1; + // DEPRECATED // The total maximum number of write buffers to maintain in memory including // copies of buffers that have already been flushed. Unlike // max_write_buffer_number, this parameter does not affect flushing. - // This controls the minimum amount of write history that will be available - // in memory for conflict checking when Transactions are used. + // This parameter is being replaced by max_write_buffer_size_to_maintain. + // If both parameters are set to non-zero values, this parameter will be + // ignored. + int max_write_buffer_number_to_maintain = 0; + + // The total maximum size(bytes) of write buffers to maintain in memory + // including copies of buffers that have already been flushed. This parameter + // only affects trimming of flushed buffers and does not affect flushing. + // This controls the maximum amount of write history that will be available + // in memory for conflict checking when Transactions are used. The actual + // size of write history (flushed Memtables) might be higher than this limit + // if further trimming will reduce write history total size below this + // limit. For example, if max_write_buffer_size_to_maintain is set to 64MB, + // and there are three flushed Memtables, with sizes of 32MB, 20MB, 20MB. + // Because trimming the next Memtable of size 20MB will reduce total memory + // usage to 52MB which is below the limit, RocksDB will stop trimming. // // When using an OptimisticTransactionDB: // If this value is too low, some transactions may fail at commit time due @@ -192,14 +207,14 @@ struct AdvancedColumnFamilyOptions { // done for conflict detection. // // Setting this value to 0 will cause write buffers to be freed immediately - // after they are flushed. - // If this value is set to -1, 'max_write_buffer_number' will be used. + // after they are flushed. If this value is set to -1, + // 'max_write_buffer_number * write_buffer_size' will be used. // // Default: // If using a TransactionDB/OptimisticTransactionDB, the default value will - // be set to the value of 'max_write_buffer_number' if it is not explicitly - // set by the user. Otherwise, the default is 0. - int max_write_buffer_number_to_maintain = 0; + // be set to the value of 'max_write_buffer_number * write_buffer_size' + // if it is not explicitly set by the user. Otherwise, the default is 0. 
+ int64_t max_write_buffer_size_to_maintain = 0; // Allows thread-safe inplace updates. If this is true, there is no way to // achieve point-in-time consistency using snapshot or iterator (assuming @@ -632,17 +647,22 @@ struct AdvancedColumnFamilyOptions { bool report_bg_io_stats = false; // Files older than TTL will go through the compaction process. - // Supported in Level and FIFO compaction. // Pre-req: This needs max_open_files to be set to -1. // In Level: Non-bottom-level files older than TTL will go through the // compation process. // In FIFO: Files older than TTL will be deleted. // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 + // In FIFO, this option will have the same meaning as + // periodic_compaction_seconds. Whichever stricter will be used. + // 0 means disabling. + // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to + // pick default. // - // Default: 0 (disabled) + // Default: 30 days for leveled compaction + block based table. disable + // otherwise. // // Dynamically changeable through SetOptions() API - uint64_t ttl = 0; + uint64_t ttl = 0xfffffffffffffffe; // Files older than this value will be picked up for compaction, and // re-written to the same level as they were before. @@ -652,13 +672,26 @@ struct AdvancedColumnFamilyOptions { // age is based on the file's last modified time (given by the underlying // Env). // - // Only supported in Level compaction. + // Supported in Level and FIFO compaction. + // In FIFO compaction, this option has the same meaning as TTL and whichever + // stricter will be used. // Pre-req: max_open_file == -1. // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 - // Default: 0 (disabled) + // + // Values: + // 0: Turn off Periodic compactions. + // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature + // as needed. For now, RocksDB will change this value to 30 days + // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction + // process at least once every 30 days if not compacted sooner. + // In FIFO compaction, since the option has the same meaning as ttl, + // when this value is left default, and ttl is left to 0, 30 days will be + // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. + // + // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune) // // Dynamically changeable through SetOptions() API - uint64_t periodic_compaction_seconds = 0; + uint64_t periodic_compaction_seconds = 0xfffffffffffffffe; // If this option is set then 1 in N blocks are compressed // using a fast (lz4) and slow (zstd) compression algorithm. 
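A hedged sketch of opting into the options documented in this hunk; the values are illustrative, not recommendations, and the helper name is hypothetical:

#include "rocksdb/options.h"

rocksdb::ColumnFamilyOptions MakeCfOptions() {
  rocksdb::ColumnFamilyOptions cf_opts;
  // Successor of max_write_buffer_number_to_maintain: cap the flushed
  // memtable history kept for conflict checking at 64MB.
  cf_opts.max_write_buffer_size_to_maintain = 64 * 1024 * 1024;
  // Leaving ttl and periodic_compaction_seconds at their defaults
  // (UINT64_MAX - 1) lets RocksDB pick the 30-day behavior; an explicit
  // value overrides it, e.g. recompact files older than 7 days:
  cf_opts.periodic_compaction_seconds = 7 * 24 * 60 * 60;
  return cf_opts;
}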
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index a0ae7ca7785..ba54085080a 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -138,6 +138,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); @@ -218,6 +222,13 @@ rocksdb_open_for_read_only_column_families( rocksdb_column_family_handle_t** column_family_handles, unsigned char error_if_log_file_exist, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary_column_families( + const rocksdb_options_t* options, const char* name, + const char* secondary_path, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** colummn_family_handles, char** errptr); + extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families( const rocksdb_options_t* options, const char* name, size_t* lencf, char** errptr); @@ -816,8 +827,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base( rocksdb_options_t*, uint64_t); -extern ROCKSDB_LIBRARY_API void rocksdb_options_set_snap_refresh_nanos( - rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level_compaction_dynamic_level_bytes(rocksdb_options_t*, unsigned char); @@ -843,8 +852,13 @@ rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*, + int64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_pipelined_write( rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_unordered_write( + rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( rocksdb_options_t*, uint32_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( @@ -1247,6 +1261,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( rocksdb_writeoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_writeoptions_set_memtable_insert_hint_per_batch(rocksdb_writeoptions_t*, + unsigned char); /* Compact range options */ @@ -1301,6 +1318,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* 
env); + extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); @@ -1368,6 +1390,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_ingest_external_file_cf( const char* const* file_list, const size_t list_len, const rocksdb_ingestexternalfileoptions_t* opt, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_try_catch_up_with_primary( + rocksdb_t* db, char** errptr); + /* SliceTransform */ extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index ed7790aebb5..27b4a6f6432 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -36,6 +36,13 @@ class Cache; extern const bool kDefaultToAdaptiveMutex; +enum CacheMetadataChargePolicy { + kDontChargeCacheMetadata, + kFullChargeCacheMetadata +}; +const CacheMetadataChargePolicy kDefaultCacheMetadataChargePolicy = + kFullChargeCacheMetadata; + struct LRUCacheOptions { // Capacity of the cache. size_t capacity = 0; @@ -59,7 +66,7 @@ struct LRUCacheOptions { // // See also // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority. - double high_pri_pool_ratio = 0.0; + double high_pri_pool_ratio = 0.5; // If non-nullptr will use this allocator instead of system allocator when // allocating memory for cache blocks. Call this method before you start using @@ -76,17 +83,23 @@ struct LRUCacheOptions { // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. bool use_adaptive_mutex = kDefaultToAdaptiveMutex; + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, std::shared_ptr _memory_allocator = nullptr, - bool _use_adaptive_mutex = kDefaultToAdaptiveMutex) + bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy _metadata_charge_policy = + kDefaultCacheMetadataChargePolicy) : capacity(_capacity), num_shard_bits(_num_shard_bits), strict_capacity_limit(_strict_capacity_limit), high_pri_pool_ratio(_high_pri_pool_ratio), memory_allocator(std::move(_memory_allocator)), - use_adaptive_mutex(_use_adaptive_mutex) {} + use_adaptive_mutex(_use_adaptive_mutex), + metadata_charge_policy(_metadata_charge_policy) {} }; // Create a new cache with a fixed size capacity. The cache is sharded @@ -99,9 +112,11 @@ struct LRUCacheOptions { // will be at least 512KB and number of shard bits will not exceed 6. extern std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits = -1, - bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.0, + bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex); + bool use_adaptive_mutex = kDefaultToAdaptiveMutex, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); @@ -110,10 +125,11 @@ extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); // more detail. // // Return nullptr if it is not supported. 
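Before the NewClockCache signature change that follows, a hedged sketch of the LRUCacheOptions fields introduced above; the capacity and shard count are illustrative:

#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeBlockCache() {
  rocksdb::LRUCacheOptions cache_opts;
  cache_opts.capacity = 512 * 1024 * 1024;  // 512MB, illustrative
  cache_opts.num_shard_bits = 6;
  // high_pri_pool_ratio now defaults to 0.5; set it explicitly if desired.
  cache_opts.high_pri_pool_ratio = 0.5;
  // Metadata is charged against capacity by default; opt out like this:
  cache_opts.metadata_charge_policy = rocksdb::kDontChargeCacheMetadata;
  return rocksdb::NewLRUCache(cache_opts);
}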
-extern std::shared_ptr NewClockCache(size_t capacity, - int num_shard_bits = -1, - bool strict_capacity_limit = false); - +extern std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits = -1, + bool strict_capacity_limit = false, + CacheMetadataChargePolicy metadata_charge_policy = + kDefaultCacheMetadataChargePolicy); class Cache { public: // Depending on implementation, cache entries with high priority could be less @@ -122,6 +138,9 @@ class Cache { Cache(std::shared_ptr allocator = nullptr) : memory_allocator_(std::move(allocator)) {} + // No copying allowed + Cache(const Cache&) = delete; + Cache& operator=(const Cache&) = delete; // Destroys all existing entries by calling the "deleter" // function that was passed via the Insert() function. @@ -226,6 +245,9 @@ class Cache { // returns the memory size for the entries in use by the system virtual size_t GetPinnedUsage() const = 0; + // returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. @@ -247,18 +269,9 @@ class Cache { virtual std::string GetPrintableOptions() const { return ""; } - // Mark the last inserted object as being a raw data block. This will be used - // in tests. The default implementation does nothing. - virtual void TEST_mark_as_data_block(const Slice& /*key*/, - size_t /*charge*/) {} - MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } private: - // No copying allowed - Cache(const Cache&); - Cache& operator=(const Cache&); - std::shared_ptr memory_allocator_; }; diff --git a/include/rocksdb/cleanable.h b/include/rocksdb/cleanable.h index 6dba8d9531c..3a111d545e2 100644 --- a/include/rocksdb/cleanable.h +++ b/include/rocksdb/cleanable.h @@ -23,12 +23,12 @@ namespace rocksdb { class Cleanable { public: Cleanable(); - ~Cleanable(); - // No copy constructor and copy assignment allowed. Cleanable(Cleanable&) = delete; Cleanable& operator=(Cleanable&) = delete; + ~Cleanable(); + // Move constructor and move assignment is allowed. Cleanable(Cleanable&&); Cleanable& operator=(Cleanable&&); diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 46279f9a693..e30a9d01459 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -20,8 +20,22 @@ class Slice; // from multiple threads. class Comparator { public: + Comparator() : timestamp_size_(0) {} + + Comparator(size_t ts_sz) : timestamp_size_(ts_sz) {} + + Comparator(const Comparator& orig) : timestamp_size_(orig.timestamp_size_) {} + + Comparator& operator=(const Comparator& rhs) { + if (this != &rhs) { + timestamp_size_ = rhs.timestamp_size_; + } + return *this; + } + virtual ~Comparator() {} + static const char* Type() { return "Comparator"; } // Three-way comparison. Returns value: // < 0 iff "a" < "b", // == 0 iff "a" == "b", @@ -78,6 +92,20 @@ class Comparator { // The major use case is to determine if DataBlockHashIndex is compatible // with the customized comparator. 
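  // For example (illustrative): a comparator that ignores ASCII case must keep
  // the default of true below, because "foo" and "FOO" compare equal despite
  // having different bytes, whereas a plain byte-wise comparator can return
  // false.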
virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } + + inline size_t timestamp_size() const { return timestamp_size_; } + + virtual int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + return Compare(a, b); + } + + virtual int CompareTimestamp(const Slice& /*ts1*/, + const Slice& /*ts2*/) const { + return 0; + } + + private: + size_t timestamp_size_; }; // Return a builtin comparator that uses lexicographic byte-wise diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index d3cbe6016ac..db26948a432 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -339,6 +339,13 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path); + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 8bec4a56f94..bc93aeda164 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -59,6 +59,7 @@ class CompactionJobInfo; #endif extern const std::string kDefaultColumnFamilyName; +extern const std::string kPersistentStatsColumnFamilyName; struct ColumnFamilyDescriptor { std::string name; ColumnFamilyOptions options; @@ -115,6 +116,10 @@ struct IngestExternalFileArg { IngestExternalFileOptions options; }; +struct GetMergeOperandsOptions { + int expected_max_number_of_operands = 0; +}; + // A collections of table properties objects, where // key: is the table's file name. // value: the table properties object of the given table. @@ -232,9 +237,13 @@ class DB { // status in case there are any errors. This will not fsync the WAL files. // If syncing is required, the caller must first call SyncWAL(), or Write() // using an empty write batch with WriteOptions.sync=true. - // Regardless of the return status, the DB must be freed. If the return - // status is NotSupported(), then the DB implementation does cleanup in the - // destructor + // Regardless of the return status, the DB must be freed. + // If the return status is Aborted(), closing fails because there is + // unreleased snapshot in the system. In this case, users can release + // the unreleased snapshots and try again and expect it to succeed. For + // other status, recalling Close() will be no-op. + // If the return status is NotSupported(), then the DB implementation does + // cleanup in the destructor virtual Status Close() { return Status::NotSupported(); } // ListColumnFamilies will open the DB specified by argument name @@ -246,6 +255,10 @@ class DB { std::vector* column_families); DB() {} + // No copying allowed + DB(const DB&) = delete; + void operator=(const DB&) = delete; + virtual ~DB(); // Create a column_family and return the handle of column family @@ -398,6 +411,22 @@ class DB { return Get(options, DefaultColumnFamily(), key, value); } + // Returns all the merge operands corresponding to the key. If the + // number of merge operands in DB is greater than + // merge_operands_options.expected_max_number_of_operands + // no merge operands are returned and status is Incomplete. Merge operands + // returned are in the order of insertion. 
+ // merge_operands- Points to an array of at-least + // merge_operands_options.expected_max_number_of_operands and the + // caller is responsible for allocating it. If the status + // returned is Incomplete then number_of_operands will contain + // the total number of merge operands found in DB for key. + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* merge_operands, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) = 0; + // If keys[i] does not exist in the database, then the i'th returned // status will be one for which Status::IsNotFound() is true, and // (*values)[i] will be set to some arbitrary value (often ""). Otherwise, @@ -461,6 +490,47 @@ class DB { values++; } } + + // Overloaded MultiGet API that improves performance by batching operations + // in the read path for greater efficiency. Currently, only the block based + // table format with full filters are supported. Other table formats such + // as plain table, block based table with block based filters and + // partitioned indexes will still work, but will not get any performance + // benefits. + // Parameters - + // options - ReadOptions + // column_family - ColumnFamilyHandle* that the keys belong to. All the keys + // passed to the API are restricted to a single column family + // num_keys - Number of keys to lookup + // keys - Pointer to C style array of key Slices with num_keys elements + // values - Pointer to C style array of PinnableSlices with num_keys elements + // statuses - Pointer to C style array of Status with num_keys elements + // sorted_input - If true, it means the input keys are already sorted by key + // order, so the MultiGet() API doesn't have to sort them + // again. If false, the keys will be copied and sorted + // internally by the API - the input array will not be + // modified + virtual void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool /*sorted_input*/ = false) { + std::vector cf; + std::vector user_keys; + std::vector status; + std::vector vals; + + for (size_t i = 0; i < num_keys; ++i) { + cf.emplace_back(column_families[i]); + user_keys.emplace_back(keys[i]); + } + status = MultiGet(options, cf, user_keys, &vals); + std::copy(status.begin(), status.end(), statuses); + for (auto& value : vals) { + values->PinSelf(value); + values++; + } + } + // If the key definitely does not exist in the database, then this method // returns false, else true. If the caller wants to obtain value when the key // is found in memory, a bool for 'value_found' must be passed. 'value_found' @@ -803,7 +873,7 @@ class DB { // stats should be included, or file stats approximation or both enum SizeApproximationFlags : uint8_t { NONE = 0, - INCLUDE_MEMTABLES = 1, + INCLUDE_MEMTABLES = 1 << 0, INCLUDE_FILES = 1 << 1 }; @@ -813,14 +883,24 @@ class DB { // Note that the returned sizes measure file system space usage, so // if the user data compresses by a factor of ten, the returned // sizes will be one-tenth the size of the corresponding user data size. - // - // If include_flags defines whether the returned size should include - // the recently written data in the mem-tables (if - // the mem-table type supports it), data serialized to disk, or both. 
- // include_flags should be of type DB::SizeApproximationFlags + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes) = 0; + + // Simpler versions of the GetApproximateSizes() method above. + // The include_flags argumenbt must of type DB::SizeApproximationFlags + // and can not be NONE. virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) = 0; + uint8_t include_flags = INCLUDE_FILES) { + SizeApproximationOptions options; + options.include_memtabtles = + (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; + options.include_files = + (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; + GetApproximateSizes(options, column_family, range, n, sizes); + } virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) { GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); @@ -960,6 +1040,9 @@ class DB { virtual Status EnableAutoCompaction( const std::vector& column_family_handles) = 0; + virtual void DisableManualCompaction() = 0; + virtual void EnableManualCompaction() = 0; + // Number of levels used for this DB. virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } @@ -1088,6 +1171,28 @@ class DB { // Retrieve the sorted list of all wal files with earliest file first virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; + // Retrieve information about the current wal file + // + // Note that the log might have rolled after this call in which case + // the current_log_file would not point to the current log file. + // + // Additionally, for the sake of optimization current_log_file->StartSequence + // would always be set to 0 + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) = 0; + + // Retrieves the creation time of the oldest file in the DB. + // This API only works if max_open_files = -1, if it is not then + // Status returned is Status::NotSupported() + // The file creation time is set using the env provided to the DB. + // If the DB was created from a very old release then its possible that + // the SST files might not have file_creation_time property and even after + // moving to a newer release its possible that some files never got compacted + // and may not have file_creation_time property. In both the cases + // file_creation_time is considered 0 which means this API will return + // creation_time = 0 as there wouldn't be a timestamp lower than 0. + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) = 0; + // Note: this API is not yet consistent with WritePrepared transactions. // Sets iter to an iterator that is positioned at a write-batch containing // seq_number. If the sequence number is non existent, it returns an iterator @@ -1169,7 +1274,30 @@ class DB { virtual Status IngestExternalFiles( const std::vector& args) = 0; - virtual Status VerifyChecksum() = 0; + // CreateColumnFamilyWithImport() will create a new column family with + // column_family_name and import external SST files specified in metadata into + // this column family. + // (1) External SST files can be created using SstFileWriter. + // (2) External SST files can be exported from a particular column family in + // an existing DB. 
+ // Option in import_options specifies whether the external files are copied or + // moved (default is copy). When option specifies copy, managing files at + // external_file_path is caller's responsibility. When option specifies a + // move, the call ensures that the specified files at external_file_path are + // deleted on successful return and files are not modified on any error + // return. + // On error return, column family handle returned will be nullptr. + // ColumnFamily will be present on successful return and will not be present + // on error return. ColumnFamily may be present on any crash during this call. + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) = 0; + + virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; + + virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } // AddFile() is deprecated, please use IngestExternalFile() ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( @@ -1313,13 +1441,25 @@ class DB { virtual Status EndTrace() { return Status::NotSupported("EndTrace() is not implemented."); } + + // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. + virtual Status StartBlockCacheTrace( + const TraceOptions& /*options*/, + std::unique_ptr&& /*trace_writer*/) { + return Status::NotSupported("StartBlockCacheTrace() is not implemented."); + } + + virtual Status EndBlockCacheTrace() { + return Status::NotSupported("EndBlockCacheTrace() is not implemented."); + } #endif // ROCKSDB_LITE // Needed for StackableDB virtual DB* GetRootDB() { return this; } - // Given a time window, return an iterator for accessing stats history - // User is responsible for deleting StatsHistoryIterator after use + // Given a window [start_time, end_time), setup a StatsHistoryIterator + // to access stats history. Note the start_time and end_time are epoch + // time measured in seconds, and end_time is an exclusive bound. virtual Status GetStatsHistory( uint64_t /*start_time*/, uint64_t /*end_time*/, std::unique_ptr* /*stats_iterator*/) { @@ -1342,11 +1482,6 @@ class DB { return Status::NotSupported("Supported only by secondary instance"); } #endif // !ROCKSDB_LITE - - private: - // No copying allowed - DB(const DB&); - void operator=(const DB&); }; // Destroy the contents of the specified database. diff --git a/include/rocksdb/db_stress_tool.h b/include/rocksdb/db_stress_tool.h new file mode 100644 index 00000000000..2ae54980e9a --- /dev/null +++ b/include/rocksdb/db_stress_tool.h @@ -0,0 +1,9 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
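Stepping back to the db.h additions above, a hedged sketch of the batched MultiGet overload and GetMergeOperands(); the database handle, key names, and operand bound are hypothetical:

#include "rocksdb/db.h"

void ReadExamples(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  rocksdb::ReadOptions read_opts;

  // Batched MultiGet: values and statuses are C-style arrays of num_keys
  // elements owned by the caller.
  constexpr size_t kNumKeys = 2;
  rocksdb::ColumnFamilyHandle* cfs[kNumKeys] = {cf, cf};
  rocksdb::Slice keys[kNumKeys] = {"k1", "k2"};
  rocksdb::PinnableSlice values[kNumKeys];
  rocksdb::Status statuses[kNumKeys];
  db->MultiGet(read_opts, kNumKeys, cfs, keys, values, statuses,
               false /* sorted_input */);

  // GetMergeOperands: returns Incomplete if the key has more operands than
  // expected_max_number_of_operands allows for.
  rocksdb::GetMergeOperandsOptions merge_opts;
  merge_opts.expected_max_number_of_operands = 4;
  rocksdb::PinnableSlice operands[4];
  int number_of_operands = 0;
  rocksdb::Status s = db->GetMergeOperands(read_opts, cf, "counter_key",
                                           operands, &merge_opts,
                                           &number_of_operands);
  (void)s;
}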
+#pragma once + +namespace rocksdb { +int db_stress_tool(int argc, char** argv); +} // namespace rocksdb diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 8f6bd607228..e70a49ffc43 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -41,6 +41,7 @@ namespace rocksdb { +class DynamicLibrary; class FileLock; class Logger; class RandomAccessFile; @@ -117,10 +118,10 @@ struct EnvOptions { bool fallocate_with_keep_size = true; // See DBOptions doc - size_t compaction_readahead_size; + size_t compaction_readahead_size = 0; // See DBOptions doc - size_t random_access_max_buffer_size; + size_t random_access_max_buffer_size = 0; // See DBOptions doc size_t writable_file_max_buffer_size = 1024 * 1024; @@ -140,9 +141,21 @@ class Env { }; Env() : thread_status_updater_(nullptr) {} + // No copying allowed + Env(const Env&) = delete; + void operator=(const Env&) = delete; virtual ~Env(); + static const char* Type() { return "Environment"; } + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result); + + // Loads the environment specified by the input value into the result + static Status LoadEnv(const std::string& value, Env** result, + std::shared_ptr* guard); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. @@ -338,6 +351,18 @@ class Env { // REQUIRES: lock has not already been unlocked. virtual Status UnlockFile(FileLock* lock) = 0; + // Opens `lib_name` as a dynamic library. + // If the 'search_path' is specified, breaks the path into its components + // based on the appropriate platform separator (";" or ";") and looks for the + // library in those directories. If 'search path is not specified, uses the + // default library path search mechanism (such as LD_LIBRARY_PATH). On + // success, stores a dynamic library in `*result`. + virtual Status LoadLibrary(const std::string& /*lib_name*/, + const std::string& /*search_path */, + std::shared_ptr* /*result*/) { + return Status::NotSupported("LoadLibrary is not implemented in this Env"); + } + // Priority for scheduling job in thread pool enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL }; @@ -382,9 +407,11 @@ class Env { // same directory. virtual Status GetTestDirectory(std::string* path) = 0; - // Create and return a log file for storing informational messages. + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can overide to provide custom + // logger. virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result) = 0; + std::shared_ptr* result); // Returns the number of micro-seconds since some fixed point in time. // It is often used as system time such as in GenericRateLimiter @@ -501,17 +528,14 @@ class Env { return Status::NotSupported(); } + virtual void SanitizeEnvOptions(EnvOptions* /*env_opts*/) const {} + // If you're adding methods here, remember to add them to EnvWrapper too. protected: // The pointer to an internal structure that will update the // status of each thread. ThreadStatusUpdater* thread_status_updater_; - - private: - // No copying allowed - Env(const Env&); - void operator=(const Env&); }; // The factory function to construct a ThreadStatusUpdater. Any Env @@ -570,6 +594,26 @@ class SequentialFile { // SequentialFileWrapper too. 
}; +// A read IO request structure for use in MultiRead +struct ReadRequest { + // File offset in bytes + uint64_t offset; + + // Length to read in bytes + size_t len; + + // A buffer that MultiRead() can optionally place data in. It can + // ignore this and allocate its own buffer + char* scratch; + + // Output parameter set by MultiRead() to point to the data buffer, and + // the number of valid bytes + Slice result; + + // Status of read + Status status; +}; + // A file abstraction for randomly reading the contents of a file. class RandomAccessFile { public: @@ -594,6 +638,22 @@ class RandomAccessFile { return Status::OK(); } + // Read a bunch of blocks as described by reqs. The blocks can + // optionally be read in parallel. This is a synchronous call, i.e it + // should return after all reads have completed. The reads will be + // non-overlapping. If the function return Status is not ok, status of + // individual requests will be ignored and return status will be assumed + // for all read requests. The function return status is only meant for any + // any errors that occur before even processing specific read requests + virtual Status MultiRead(ReadRequest* reqs, size_t num_reqs) { + assert(reqs != nullptr); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest& req = reqs[i]; + req.status = Read(req.offset, req.len, &req.result, req.scratch); + } + return Status::OK(); + } + // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). // Furthermore, it tries to make this ID at most "max_size" bytes. If such an @@ -655,6 +715,9 @@ class WritableFile { io_priority_(Env::IO_TOTAL), write_hint_(Env::WLTH_NOT_SET), strict_bytes_per_sync_(options.strict_bytes_per_sync) {} + // No copying allowed + WritableFile(const WritableFile&) = delete; + void operator=(const WritableFile&) = delete; virtual ~WritableFile(); @@ -814,9 +877,6 @@ class WritableFile { private: size_t last_preallocated_block_; size_t preallocation_block_size_; - // No copying allowed - WritableFile(const WritableFile&); - void operator=(const WritableFile&); protected: Env::IOPriority io_priority_; @@ -828,6 +888,10 @@ class WritableFile { class RandomRWFile { public: RandomRWFile() {} + // No copying allowed + RandomRWFile(const RandomRWFile&) = delete; + RandomRWFile& operator=(const RandomRWFile&) = delete; + virtual ~RandomRWFile() {} // Indicates if the class makes use of direct I/O @@ -858,10 +922,6 @@ class RandomRWFile { // If you're adding methods here, remember to add them to // RandomRWFileWrapper too. - - // No copying allowed - RandomRWFile(const RandomRWFile&) = delete; - RandomRWFile& operator=(const RandomRWFile&) = delete; }; // MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer. @@ -919,6 +979,10 @@ class Logger { explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) : closed_(false), log_level_(log_level) {} + // No copying allowed + Logger(const Logger&) = delete; + void operator=(const Logger&) = delete; + virtual ~Logger(); // Close the log file. Must be called before destructor. 
If the return @@ -960,9 +1024,6 @@ class Logger { bool closed_; private: - // No copying allowed - Logger(const Logger&); - void operator=(const Logger&); InfoLogLevel log_level_; }; @@ -974,8 +1035,29 @@ class FileLock { private: // No copying allowed - FileLock(const FileLock&); - void operator=(const FileLock&); + FileLock(const FileLock&) = delete; + void operator=(const FileLock&) = delete; +}; + +class DynamicLibrary { + public: + virtual ~DynamicLibrary() {} + + // Returns the name of the dynamic library. + virtual const char* Name() const = 0; + + // Loads the symbol for sym_name from the library and updates the input + // function. Returns the loaded symbol. + template + Status LoadFunction(const std::string& sym_name, std::function* function) { + assert(nullptr != function); + void* ptr = nullptr; + Status s = LoadSymbol(sym_name, &ptr); + *function = reinterpret_cast(ptr); + return s; + } + // Loads and returns the symbol for sym_name from the library. + virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0; }; extern void LogFlush(const std::shared_ptr& info_log); @@ -1168,6 +1250,12 @@ class EnvWrapper : public Env { Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return target_->LoadLibrary(lib_name, search_path, result); + } + void Schedule(void (*f)(void* arg), void* a, Priority pri, void* tag = nullptr, void (*u)(void* arg) = nullptr) override { return target_->Schedule(f, a, pri, tag, u); @@ -1277,6 +1365,9 @@ class EnvWrapper : public Env { Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { return target_->GetFreeSpace(path, diskfree); } + void SanitizeEnvOptions(EnvOptions* env_opts) const override { + target_->SanitizeEnvOptions(env_opts); + } private: Env* target_; @@ -1315,6 +1406,9 @@ class RandomAccessFileWrapper : public RandomAccessFile { char* scratch) const override { return target_->Read(offset, n, result, scratch); } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + return target_->MultiRead(reqs, num_reqs); + } Status Prefetch(uint64_t offset, size_t n) override { return target_->Prefetch(offset, n); } @@ -1484,4 +1578,10 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); // This is a factory method for TimedEnv defined in utilities/env_timed.cc. Env* NewTimedEnv(Env* base_env); +// Returns an instance of logger that can be used for storing informational +// messages. +// This is a factory method for EnvLogger declared in logging/env_logging.h +Status NewEnvLogger(const std::string& fname, Env* env, + std::shared_ptr* result); + } // namespace rocksdb diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 950fbe616ea..ad6862d5f38 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -25,9 +25,12 @@ #include #include +#include "rocksdb/advanced_options.h" + namespace rocksdb { class Slice; +struct BlockBasedTableOptions; // A class that takes a bunch of keys, then generates filter class FilterBitsBuilder { @@ -44,12 +47,13 @@ class FilterBitsBuilder { // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; - // Calculate num of entries fit into a space. + // Calculate num of keys that can be added and generate a filter + // <= the specified number of bytes. 
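  // For example (illustrative): a builder targeting roughly 10 bits per key
  // could return about (bytes * 8) / 10 from CalculateNumEntry(bytes).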
#if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable : 4702) // unreachable code #endif - virtual int CalculateNumEntry(const uint32_t /*space*/) { + virtual int CalculateNumEntry(const uint32_t /*bytes*/) { #ifndef ROCKSDB_LITE throw std::runtime_error("CalculateNumEntry not Implemented"); #else @@ -79,6 +83,26 @@ class FilterBitsReader { } }; +// Contextual information passed to BloomFilterPolicy at filter building time. +// Used in overriding FilterPolicy::GetBuilderWithContext(). +struct FilterBuildingContext { + // This constructor is for internal use only and subject to change. + FilterBuildingContext(const BlockBasedTableOptions& table_options); + + // Options for the table being built + const BlockBasedTableOptions& table_options; + + // Name of the column family for the table (or empty string if unknown) + std::string column_family_name; + + // The compactions style in effect for the table + CompactionStyle compaction_style = kCompactionStyleLevel; + + // The table level at time of constructing the SST file, or -1 if unknown. + // (The table file could later be used at a different level.) + int level_at_creation = -1; +}; + // We add a new format of filter block called full filter block // This new interface gives you more space of customization // @@ -119,13 +143,26 @@ class FilterPolicy { // list, but it should aim to return false with a high probability. virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; - // Get the FilterBitsBuilder, which is ONLY used for full filter block - // It contains interface to take individual key, then generate filter + // Return a new FilterBitsBuilder for full or partitioned filter blocks, or + // nullptr if using block-based filter. + // NOTE: This function is only called by GetBuilderWithContext() below for + // custom FilterPolicy implementations. Thus, it is not necessary to + // override this function if overriding GetBuilderWithContext(). virtual FilterBitsBuilder* GetFilterBitsBuilder() const { return nullptr; } - // Get the FilterBitsReader, which is ONLY used for full filter block - // It contains interface to tell if key can be in filter - // The input slice should NOT be deleted by FilterPolicy + // A newer variant of GetFilterBitsBuilder that allows a FilterPolicy + // to customize the builder for contextual constraints and hints. + // (Name changed to avoid triggering -Werror=overloaded-virtual.) + // If overriding GetFilterBitsBuilder() suffices, it is not necessary to + // override this function. + virtual FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const { + return GetFilterBitsBuilder(); + } + + // Return a new FilterBitsReader for full or partitioned filter blocks, or + // nullptr if using block-based filter. + // As here, the input slice should NOT be deleted by FilterPolicy. virtual FilterBitsReader* GetFilterBitsReader( const Slice& /*contents*/) const { return nullptr; @@ -135,10 +172,14 @@ class FilterPolicy { // Return a new filter policy that uses a bloom filter with approximately // the specified number of bits per key. // -// bits_per_key: bits per key in bloom filter. A good value for bits_per_key -// is 10, which yields a filter with ~ 1% false positive rate. -// use_block_based_builder: use block based filter rather than full filter. -// If you want to builder full filter, it needs to be set to false. +// bits_per_key: average bits allocated per key in bloom filter. A good +// choice is 9.9, which yields a filter with ~ 1% false positive rate. 
+// When format_version < 5, the value will be rounded to the nearest +// integer. Recommend using no more than three decimal digits after the +// decimal point, as in 6.667. +// +// use_block_based_builder: use deprecated block based filter (true) rather +// than full or partitioned filter (false). // // Callers must delete the result after any database that is using the // result has been closed. @@ -151,5 +192,5 @@ class FilterPolicy { // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. extern const FilterPolicy* NewBloomFilterPolicy( - int bits_per_key, bool use_block_based_builder = false); + double bits_per_key, bool use_block_based_builder = false); } // namespace rocksdb diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index e99b434a019..162e262e328 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -28,6 +28,10 @@ namespace rocksdb { class Iterator : public Cleanable { public: Iterator() {} + // No copying allowed + Iterator(const Iterator&) = delete; + void operator=(const Iterator&) = delete; + virtual ~Iterator() {} // An iterator is either positioned at a key/value pair, or @@ -104,11 +108,6 @@ class Iterator : public Cleanable { // Get the user-key portion of the internal key at which the iteration // stopped. virtual Status GetProperty(std::string prop_name, std::string* prop); - - private: - // No copying allowed - Iterator(const Iterator&); - void operator=(const Iterator&); }; // Return an empty iterator (yields nothing). diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 5be55cbede8..57bb1eeb0d4 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -170,6 +170,10 @@ struct FlushJobInfo { std::string cf_name; // the path to the newly created file std::string file_path; + // the file number of the newly created file + uint64_t file_number; + // the oldest blob file referenced by the newly created file + uint64_t oldest_blob_file_number; // the id of the thread that completed this flush job. uint64_t thread_id; // the job id, which is unique in the same thread. @@ -194,11 +198,18 @@ struct FlushJobInfo { FlushReason flush_reason; }; -struct CompactionJobInfo { - CompactionJobInfo() = default; - explicit CompactionJobInfo(const CompactionJobStats& _stats) - : stats(_stats) {} +struct CompactionFileInfo { + // The level of the file. + int level; + + // The file number of the file. + uint64_t file_number; + // The file number of the oldest blob file this SST file references. + uint64_t oldest_blob_file_number; +}; + +struct CompactionJobInfo { // the id of the column family where the compaction happened. uint32_t cf_id; // the name of the column family where the compaction happened. @@ -213,11 +224,25 @@ struct CompactionJobInfo { int base_input_level; // the output level of the compaction. int output_level; - // the names of the compaction input files. + + // The following variables contain information about compaction inputs + // and outputs. A file may appear in both the input and output lists + // if it was simply moved to a different level. The order of elements + // is the same across input_files and input_file_infos; similarly, it is + // the same across output_files and output_file_infos. + + // The names of the compaction input files. std::vector input_files; - // the names of the compaction output files. + // Additional information about the compaction input files. 
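A small configuration sketch for the new fractional bits_per_key (the surrounding option values are illustrative):

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

// Use a full (non-block-based) Bloom filter with roughly a 1% false positive rate.
rocksdb::Options MakeOptionsWithBloom() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(9.9, /*use_block_based_builder=*/false));
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}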
+ std::vector input_file_infos; + + // The names of the compaction output files. std::vector output_files; + + // Additional information about the compaction output files. + std::vector output_file_infos; + // Table properties for input and output tables. // The map is keyed by values from input_files and output_files. TablePropertiesCollection table_properties; @@ -459,6 +484,7 @@ class EventListener { #else class EventListener {}; +struct FlushJobInfo {}; #endif // ROCKSDB_LITE diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 328422f5703..7f18a581e97 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -120,6 +120,28 @@ class MemTableRep { return true; } + // Same as ::InsertWithHint, but allows concurrent writes + // + // If hint points to nullptr, a new hint will be allocated on the heap, otherwise + // the hint will be updated to reflect the last insert location. The hint is + // owned by the caller and it is the caller's responsibility to delete the + // hint later. + // + // Currently only the skip-list based memtable implements the interface. Other + // implementations will fall back to InsertConcurrently() by default. + virtual void InsertWithHintConcurrently(KeyHandle handle, void** /*hint*/) { + // Ignore the hint by default. + InsertConcurrently(handle); + } + + // Same as ::InsertWithHintConcurrently + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the key already exists. + virtual bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) { + InsertWithHintConcurrently(handle, hint); + return true; + } + // Like Insert(handle), but may be called concurrent with other calls // to InsertConcurrently for other handles. // diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index d8ddcc6a097..36f47e254ed 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -46,6 +46,7 @@ class Logger; class MergeOperator { public: virtual ~MergeOperator() {} + static const char* Type() { return "MergeOperator"; } // Gives the client a way to express the read -> modify -> write semantics // key: (IN) The key that's associated with this merge operation.
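As a sketch of how a listener might consume the new per-file compaction details (the logging format is illustrative only):

#include <cinttypes>
#include <cstdio>
#include "rocksdb/listener.h"

// Logs the level and file number of every compaction input and output file.
class CompactionFileLogger : public rocksdb::EventListener {
 public:
  void OnCompactionCompleted(rocksdb::DB* /*db*/,
                             const rocksdb::CompactionJobInfo& info) override {
    for (const rocksdb::CompactionFileInfo& fi : info.input_file_infos) {
      std::printf("input:  level=%d number=%" PRIu64 "\n", fi.level,
                  fi.file_number);
    }
    for (const rocksdb::CompactionFileInfo& fi : info.output_file_infos) {
      std::printf("output: level=%d number=%" PRIu64 "\n", fi.level,
                  fi.file_number);
    }
  }
};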
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index a0ab41efdfb..fecee84304d 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -55,25 +55,25 @@ struct LevelMetaData { struct SstFileMetaData { SstFileMetaData() : size(0), - name(""), - db_path(""), + file_number(0), smallest_seqno(0), largest_seqno(0), - smallestkey(""), - largestkey(""), num_reads_sampled(0), being_compacted(false), num_entries(0), - num_deletions(0) {} + num_deletions(0), + oldest_blob_file_number(0) {} - SstFileMetaData(const std::string& _file_name, const std::string& _path, - size_t _size, SequenceNumber _smallest_seqno, - SequenceNumber _largest_seqno, + SstFileMetaData(const std::string& _file_name, uint64_t _file_number, + const std::string& _path, size_t _size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted) + bool _being_compacted, uint64_t _oldest_blob_file_number, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time) : size(_size), name(_file_name), + file_number(_file_number), db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), @@ -82,12 +82,17 @@ struct SstFileMetaData { num_reads_sampled(_num_reads_sampled), being_compacted(_being_compacted), num_entries(0), - num_deletions(0) {} + num_deletions(0), + oldest_blob_file_number(_oldest_blob_file_number), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time) {} // File size in bytes. size_t size; // The name of the file. std::string name; + // The id of the file. + uint64_t file_number; // The full path where the file locates. std::string db_path; @@ -100,6 +105,18 @@ struct SstFileMetaData { uint64_t num_entries; uint64_t num_deletions; + + uint64_t oldest_blob_file_number; // The id of the oldest blob file + // referenced by the file. + // An SST file may be generated by compactions whose input files may + // in turn be generated by earlier compactions. The creation time of the + // oldest SST file that is the compaction ancester of this file. + // The timestamp is provided Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t oldest_ancester_time; + // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). + // 0 if the information is not available. + uint64_t file_creation_time; }; // The full set of metadata associated with each SST file. @@ -108,4 +125,11 @@ struct LiveFileMetaData : SstFileMetaData { int level; // Level at which this file resides. LiveFileMetaData() : column_family_name(), level(0) {} }; + +// Metadata returned as output from ExportColumnFamily() and used as input to +// CreateColumnFamiliesWithImport(). +struct ExportImportFilesMetaData { + std::string db_comparator_name; // Used to safety check at import. + std::vector files; // Vector of file metadata. +}; } // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index ab856bee8e1..624aa524573 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -269,16 +269,8 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; - // If non-zero, compactions will periodically refresh the snapshot list. The - // delay for the first refresh is snap_refresh_nanos nano seconds and - // exponentially increases afterwards. 
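A sketch of reading the newly exposed per-file metadata through the existing GetLiveFilesMetaData() call (the output format is illustrative):

#include <cinttypes>
#include <cstdio>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

// Print file number, oldest referenced blob file, and creation times for all
// live SST files.
void DumpLiveFileMetaData(rocksdb::DB* db) {
  std::vector<rocksdb::LiveFileMetaData> files;
  db->GetLiveFilesMetaData(&files);
  for (const auto& f : files) {
    std::printf("%s: number=%" PRIu64 " oldest_blob=%" PRIu64
                " oldest_ancester_time=%" PRIu64 " creation_time=%" PRIu64 "\n",
                f.name.c_str(), f.file_number, f.oldest_blob_file_number,
                f.oldest_ancester_time, f.file_creation_time);
  }
}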
When having many short-lived snapshots, - this option helps reducing the cpu usage of long-running compactions. The - feature is disabled when max_subcompactions is greater than one. - // - // Default: 0.5s - // - // Dynamically changeable through SetOptions() API - uint64_t snap_refresh_nanos = 500 * 1000 * 1000; // 0.5s + // Deprecated. + uint64_t snap_refresh_nanos = 0; // Disable automatic compactions. Manual compactions can still // be issued on this column family @@ -694,6 +686,18 @@ struct DBOptions { // Default: 600 unsigned int stats_persist_period_sec = 600; + // If true, automatically persist stats to a hidden column family (column + // family name: ___rocksdb_stats_history___) every + // stats_persist_period_sec seconds; otherwise, write to an in-memory + // struct. Users can query it through the `GetStatsHistory` API. + // If a user attempts to create a column family with the same name on a DB + // which has previously set persist_stats_to_disk to true, the column family + // creation will fail, but the hidden column family will survive, as well as + // the previously persisted statistics. + // When persisting stats to disk, the stat name will be limited to 100 bytes. + // Default: false + bool persist_stats_to_disk = false; + // if not zero, periodically take stats snapshots and store in memory, the // memory size for stats snapshots is capped at stats_history_buffer_size // Default: 1MB @@ -748,6 +752,8 @@ // for this mode if using block-based table. // // Default: false + // This flag has no effect on the behavior of compaction and is planned for + // removal in the future. bool new_table_reader_for_compaction_inputs = false; // If non-zero, we perform bigger reads when doing compaction. If you're @@ -893,6 +899,32 @@ // Default: false bool enable_pipelined_write = false; + // Setting unordered_write to true trades higher write throughput for + // a relaxed immutability guarantee of snapshots. This violates the + // repeatability one expects from ::Get from a snapshot, as well as + // ::MultiGet and Iterator's consistent-point-in-time view property. + // If the application cannot tolerate the relaxed guarantees, it can implement + // its own mechanisms to work around that and yet benefit from the higher + // throughput. Using TransactionDB with WRITE_PREPARED write policy and + // two_write_queues=true is one way to achieve immutable snapshots despite + // unordered_write. + // + // By default, i.e., when it is false, rocksdb does not advance the sequence + // number for new snapshots unless all the writes with lower sequence numbers + // are already finished. This provides the immutability that we expect from + // snapshots. Moreover, since Iterator and MultiGet internally depend on + // snapshots, the snapshot immutability results in Iterator and MultiGet + // offering a consistent-point-in-time view. If set to true, although + // Read-Your-Own-Write property is still provided, the snapshot immutability + // property is relaxed: the writes issued after the snapshot is obtained (with + // larger sequence numbers) will still not be visible to the reads from that + // snapshot; however, there might still be pending writes (with lower sequence + // number) that will change the state visible to the snapshot after they are + // applied to the memtable. + // + // Default: false + bool unordered_write = false; + // If true, allow multi-writers to update mem tables in parallel.
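A configuration sketch combining the two new DBOptions above; the path is illustrative, and whether the relaxed snapshot guarantees of unordered_write are acceptable depends on the application:

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

// Persist statistics snapshots to the hidden stats column family and opt in
// to unordered writes.
rocksdb::Status OpenWithStatsAndUnorderedWrite(rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  options.persist_stats_to_disk = true;
  options.stats_persist_period_sec = 600;
  options.unordered_write = true;
  return rocksdb::DB::Open(options, "/tmp/rocksdb_example", db);
}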
// Only some memtable_factory-s support concurrent writes; currently it // is implemented only for SkipListFactory. Concurrent memtable writes @@ -911,6 +943,13 @@ struct DBOptions { // Default: true bool enable_write_thread_adaptive_yield = true; + // The maximum limit of number of bytes that are written in a single batch + // of WAL or memtable write. It is followed when the leader write size + // is larger than 1/8 of this limit. + // + // Default: 1 MB + uint64_t max_write_batch_group_size_bytes = 1 << 20; + // The maximum number of microseconds that a write operation will use // a yielding spin loop to coordinate with other write threads before // blocking on a mutex. (Assuming write_thread_slow_yield_usec is @@ -1047,6 +1086,24 @@ struct DBOptions { // If set to true, takes precedence over // ReadOptions::background_purge_on_iterator_cleanup. bool avoid_unnecessary_blocking_io = false; + + // Historically DB ID has always been stored in Identity File in DB folder. + // If this flag is true, the DB ID is written to Manifest file in addition + // to the Identity file. By doing this 2 problems are solved + // 1. We don't checksum the Identity file where as Manifest file is. + // 2. Since the source of truth for DB is Manifest file DB ID will sit with + // the source of truth. Previously the Identity file could be copied + // independent of Manifest and that can result in wrong DB ID. + // We recommend setting this flag to true. + // Default: false + bool write_dbid_to_manifest = false; + + // The number of bytes to prefetch when reading the log. This is mostly useful + // for reading a remotely located log, as it can save the number of + // round-trips. If 0, then the prefetching is disabled. + // + // Default: 0 + size_t log_readahead_size = 0; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1229,6 +1286,14 @@ struct ReadOptions { // Default: 0 (don't filter by seqnum, return user keys) SequenceNumber iter_start_seqnum; + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. The user is responsible for providing a customized + // compare function via Comparator to order tuples. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp; + ReadOptions(); ReadOptions(bool cksum, bool cache); }; @@ -1281,12 +1346,34 @@ struct WriteOptions { // Default: false bool low_pri; + // If true, this writebatch will maintain the last insert positions of each + // memtable as hints in concurrent write. It can improve write performance + // in concurrent writes if keys in one writebatch are sequential. In + // non-concurrent writes (when concurrent_memtable_writes is false) this + // option will be ignored. + // + // Default: false + bool memtable_insert_hint_per_batch; + + // Timestamp of write operation, e.g. Put. All timestamps of the same + // database must share the same length and format. The user is also + // responsible for providing a customized compare function via Comparator to + // order tuples. If the user wants to enable timestamp, then + // all write operations must be associated with timestamp because RocksDB, as + // a single-node storage engine currently has no knowledge of global time, + // thus has to rely on the application. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. 
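A brief sketch of the new manifest/WAL knobs plus the per-batch insert hint; the 2 MB readahead value is illustrative:

#include "rocksdb/options.h"

// Record the DB ID in the MANIFEST, prefetch WAL reads during recovery, and
// keep per-batch memtable insert hints for concurrent writes.
void ConfigureNewOptions(rocksdb::Options* options,
                         rocksdb::WriteOptions* write_options) {
  options->write_dbid_to_manifest = true;
  options->log_readahead_size = 2 * 1024 * 1024;
  write_options->memtable_insert_hint_per_batch = true;
}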
+ const Slice* timestamp; + WriteOptions() : sync(false), disableWAL(false), ignore_missing_column_families(false), no_slowdown(false), - low_pri(false) {} + low_pri(false), + memtable_insert_hint_per_batch(false), + timestamp(nullptr) {} }; // Options that control flush operations @@ -1372,6 +1459,8 @@ struct CompactRangeOptions { struct IngestExternalFileOptions { // Can be set to true to move the files instead of copying them. bool move_files = false; + // If set to true, ingestion falls back to copy when move fails. + bool failed_move_fall_back_to_copy = true; // If set to false, an ingested file keys could appear in existing snapshots // that where created before the file was ingested. bool snapshot_consistency = true; @@ -1406,6 +1495,13 @@ struct IngestExternalFileOptions { // Warning: setting this to true causes slowdown in file ingestion because // the external SST file has to be read. bool verify_checksums_before_ingest = false; + // When verify_checksums_before_ingest = true, RocksDB uses default + // readahead setting to scan the file while verifying checksums before + // ingestion. + // Users can override the default value using this option. + // Using a large readahead size (> 2MB) can typically improve the performance + // of forward iteration on spinning disks. + size_t verify_checksums_readahead_size = 0; }; enum TraceFilterType : uint64_t { @@ -1429,4 +1525,30 @@ struct TraceOptions { uint64_t filter = kTraceFilterNone; }; +// ImportColumnFamilyOptions is used by ImportColumnFamily() +struct ImportColumnFamilyOptions { + // Can be set to true to move the files instead of copying them. + bool move_files = false; +}; + +// Options used with DB::GetApproximateSizes() +struct SizeApproximationOptions { + // Defines whether the returned size should include the recently written + // data in the mem-tables. If set to false, include_files must be true. + bool include_memtabtles = false; + // Defines whether the returned size should include data serialized to disk. + // If set to false, include_memtabtles must be true. + bool include_files = true; + // When approximating the files total size that is used to store a keys range + // using DB::GetApproximateSizes, allow approximation with an error margin of + // up to total_files_size * files_size_error_margin. This allows to take some + // shortcuts in files size approximation, resulting in better performance, + // while guaranteeing the resulting error is within a reasonable margin. + // E.g., if the value is 0.1, then the error margin of the returned files size + // approximation will be within 10%. + // If the value is non-positive - a more precise yet more CPU intensive + // estimation is performed. + double files_size_error_margin = -1.0; +}; + } // namespace rocksdb diff --git a/include/rocksdb/sst_file_reader.h b/include/rocksdb/sst_file_reader.h index 517907dd501..522a8d9a1df 100644 --- a/include/rocksdb/sst_file_reader.h +++ b/include/rocksdb/sst_file_reader.h @@ -33,7 +33,9 @@ class SstFileReader { std::shared_ptr GetTableProperties() const; // Verifies whether there is corruption in this table. 
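A sketch of the new size-approximation API with an explicit error margin (the key range is illustrative; note that include_memtabtles matches the field name as declared):

#include <cstdint>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Approximate the size of the ["a", "z") range, allowing a 10% error margin
// in the file-size estimate in exchange for a cheaper computation.
rocksdb::Status ApproximateRangeSize(rocksdb::DB* db, uint64_t* size) {
  rocksdb::SizeApproximationOptions size_options;
  size_options.include_memtabtles = true;
  size_options.include_files = true;
  size_options.files_size_error_margin = 0.1;
  rocksdb::Range range("a", "z");
  return db->GetApproximateSizes(size_options, db->DefaultColumnFamily(),
                                 &range, 1, size);
}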
- Status VerifyChecksum(); + Status VerifyChecksum(const ReadOptions& /*read_options*/); + + Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } private: struct Rep; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 3b2b2e048c7..b6b78ef99a3 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -324,6 +324,8 @@ enum Tickers : uint32_t { TXN_DUPLICATE_KEY_OVERHEAD, // # of times snapshot_mutex_ is acquired in the fast path. TXN_SNAPSHOT_MUTEX_OVERHEAD, + // # of times ::Get returned TryAgain due to expired snapshot seq + TXN_GET_TRY_AGAIN, // Number of keys actually found in MultiGet calls (vs number requested by // caller) @@ -447,6 +449,10 @@ struct HistogramData { double min = 0.0; }; +// StatsLevel can be used to reduce statistics overhead by skipping certain +// types of stats in the stats collection process. +// Usage: +// options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); enum StatsLevel : uint8_t { // Disable timer stats, and skip histogram stats kExceptHistogramOrTimers, @@ -464,11 +470,19 @@ enum StatsLevel : uint8_t { kAll, }; -// Analyze the performance of a db +// Analyze the performance of a db by providing cumulative stats over time. +// Usage: +// Options options; +// options.statistics = rocksdb::CreateDBStatistics(); +// Status s = DB::Open(options, kDBPath, &db); +// ... +// options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +// HistogramData hist; +// options.statistics->histogramData(FLUSH_TIME, &hist); class Statistics { public: virtual ~Statistics() {} - + static const char* Type() { return "Statistics"; } virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; diff --git a/include/rocksdb/stats_history.h b/include/rocksdb/stats_history.h index 40ea51d1ff0..c6634ae68aa 100644 --- a/include/rocksdb/stats_history.h +++ b/include/rocksdb/stats_history.h @@ -11,7 +11,6 @@ #include #include -// #include "db/db_impl.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -19,6 +18,25 @@ namespace rocksdb { class DBImpl; +// StatsHistoryIterator is the main interface for users to programmatically +// access statistics snapshots that was automatically stored by RocksDB. +// Depending on options, the stats can be in memory or on disk. +// The stats snapshots are indexed by time that they were recorded, and each +// stats snapshot contains individual stat name and value at the time of +// recording. +// Example: +// std::unique_ptr stats_iter; +// Status s = db->GetStatsHistory(0 /* start_time */, +// env->NowMicros() /* end_time*/, +// &stats_iter); +// if (s.ok) { +// for (; stats_iter->Valid(); stats_iter->Next()) { +// uint64_t stats_time = stats_iter->GetStatsTime(); +// const std::map& stats_map = +// stats_iter->GetStatsMap(); +// process(stats_time, stats_map); +// } +// } class StatsHistoryIterator { public: StatsHistoryIterator() {} @@ -31,10 +49,12 @@ class StatsHistoryIterator { // REQUIRES: Valid() virtual void Next() = 0; - // Return the time stamp (in microseconds) when stats history is recorded. + // Return the time stamp (in seconds) when stats history is recorded. // REQUIRES: Valid() virtual uint64_t GetStatsTime() const = 0; + virtual int GetFormatVersion() const { return -1; } + // Return the current stats history as an std::map which specifies the // mapping from stats name to stats value . 
The underlying storage // for the returned map is valid only until the next modification of diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 12e8070d1e8..507d04168e2 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -58,7 +58,9 @@ class Status { kBusy = 11, kExpired = 12, kTryAgain = 13, - kCompactionTooLarge = 14 + kCompactionTooLarge = 14, + kColumnFamilyDropped = 15, + kMaxCode }; Code code() const { return code_; } @@ -74,6 +76,8 @@ class Status { kMemoryLimit = 7, kSpaceLimit = 8, kPathNotFound = 9, + KMergeOperandsInsufficientCapacity = 10, + kManualCompactionPaused = 11, kMaxSubCode }; @@ -184,6 +188,15 @@ class Status { return Status(kCompactionTooLarge, msg, msg2); } + static Status ColumnFamilyDropped(SubCode msg = kNone) { + return Status(kColumnFamilyDropped, msg); + } + + static Status ColumnFamilyDropped(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kColumnFamilyDropped, msg, msg2); + } + static Status NoSpace() { return Status(kIOError, kNoSpace); } static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIOError, kNoSpace, msg, msg2); @@ -256,6 +269,9 @@ class Status { // Returns true iff the status indicates the proposed compaction is too large bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + // Returns true iff the status indicates Column Family Dropped + bool IsColumnFamilyDropped() const { return code() == kColumnFamilyDropped; } + // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" // error condition. Stricto sensu, an NoSpace error is an I/O error @@ -280,6 +296,12 @@ class Status { return (code() == kIOError) && (subcode() == kPathNotFound); } + // Returns true iff the status indicates manual compaction paused. This + // is caused by a call to PauseManualCompaction + bool IsManualCompactionPaused() const { + return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 88fcc78ed8c..63dce41efc8 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -74,7 +74,7 @@ struct BlockBasedTableOptions { // blocks with high priority. If set to true, depending on implementation of // block cache, index and filter blocks may be less likely to be evicted // than data blocks. - bool cache_index_and_filter_blocks_with_high_priority = false; + bool cache_index_and_filter_blocks_with_high_priority = true; // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is @@ -93,14 +93,32 @@ struct BlockBasedTableOptions { enum IndexType : char { // A space efficient index block that is optimized for // binary-search-based index. - kBinarySearch, + kBinarySearch = 0x00, // The hash index, if enabled, will do the hash lookup when // `Options.prefix_extractor` is provided. - kHashSearch, + kHashSearch = 0x01, // A two-level index implementation. Both levels are binary search indexes. - kTwoLevelIndexSearch, + kTwoLevelIndexSearch = 0x02, + + // Like kBinarySearch, but index also contains first key of each block. + // This allows iterators to defer reading the block until it's actually + // needed. May significantly reduce read amplification of short range scans. 
+ // Without it, iterator seek usually reads one block from each level-0 file + // and from each level, which may be expensive. + // Works best in combination with: + // - IndexShorteningMode::kNoShortening, + // - custom FlushBlockPolicy to cut blocks at some meaningful boundaries, + // e.g. when prefix changes. + // Makes the index significantly bigger (2x or more), especially when keys + // are long. + // + // IO errors are not handled correctly in this mode right now: if an error + // happens when lazily reading a block in value(), value() returns empty + // slice, and you need to call Valid()/status() afterwards. + // TODO(kolmike): Fix it. + kBinarySearchWithFirstKey = 0x03, }; IndexType index_type = kBinarySearch; @@ -251,6 +269,9 @@ struct BlockBasedTableOptions { // probably use this as it would reduce the index size. // This option only affects newly written tables. When reading existing // tables, the information about version is read from the footer. + // 5 -- Can be read by RocksDB's versions since X.X.X (something after 6.4.6) + // Full and partitioned filters use a generally faster and more accurate + // Bloom filter implementation, with a different schema. uint32_t format_version = 2; // Store index blocks on disk in compressed format. Changing this option to diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 7817c564965..afff2c2ac74 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,11 +10,7 @@ #pragma once #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include #include #include #include @@ -280,10 +276,14 @@ class BackupEngine { progress_callback); } - // deletes old backups, keeping latest num_backups_to_keep alive + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; - // deletes a specific backup + // Deletes a specific backup. If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. virtual Status DeleteBackup(BackupID backup_id) = 0; // Call this from another thread if you want to stop the backup @@ -291,8 +291,8 @@ class BackupEngine { // not wait for the backup to stop. // The backup will stop ASAP and the call to CreateNewBackup will // return Status::Incomplete(). It will not clean up after itself, but - // the state will remain consistent. The state will be cleaned up - // next time you create BackupableDB or RestoreBackupableDB. + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. virtual void StopBackup() = 0; // Returns info about backups in backup_info @@ -327,9 +327,13 @@ class BackupEngine { // Returns Status::OK() if all checks are good virtual Status VerifyBackup(BackupID backup_id) = 0; - // Will delete all the files we don't need anymore - // It will do the full scan of the files/ directory and delete all the - // files that are not referenced. + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). 
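A configuration sketch opting in to the first-key index and the format_version 5 filter schema; both only affect newly written SST files, and older RocksDB releases cannot read format_version 5 tables:

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeOptionsWithNewTableFormat() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kBinarySearchWithFirstKey;
  table_options.format_version = 5;
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}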
+ // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. virtual Status GarbageCollect() = 0; }; diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h index aa0a394d4d0..5f12922c454 100644 --- a/include/rocksdb/utilities/checkpoint.h +++ b/include/rocksdb/utilities/checkpoint.h @@ -9,11 +9,15 @@ #ifndef ROCKSDB_LITE #include +#include #include "rocksdb/status.h" namespace rocksdb { class DB; +class ColumnFamilyHandle; +struct LiveFileMetaData; +struct ExportImportFilesMetaData; class Checkpoint { public: @@ -36,6 +40,16 @@ class Checkpoint { virtual Status CreateCheckpoint(const std::string& checkpoint_dir, uint64_t log_size_for_flush = 0); + // Exports all live SST files of a specified Column Family onto export_dir, + // returning SST files information in metadata. + // - SST files will be created as hard links when the directory specified + // is in the same partition as the db directory, copied otherwise. + // - export_dir should not already exist and will be created by this API. + // - Always triggers a flush. + virtual Status ExportColumnFamily(ColumnFamilyHandle* handle, + const std::string& export_dir, + ExportImportFilesMetaData** metadata); + virtual ~Checkpoint() {} }; diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h index 50645423d0a..3fc414b6edf 100644 --- a/include/rocksdb/utilities/debug.h +++ b/include/rocksdb/utilities/debug.h @@ -40,6 +40,10 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, size_t max_num_ikeys, std::vector* key_versions); +Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, + Slice end_key, size_t max_num_ikeys, + std::vector* key_versions); + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index 57ab88a34eb..cf7d25fba2c 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -29,8 +29,10 @@ namespace rocksdb { class LDBCommand { public: // Command-line arguments + static const std::string ARG_ENV_URI; static const std::string ARG_DB; static const std::string ARG_PATH; + static const std::string ARG_SECONDARY_PATH; static const std::string ARG_HEX; static const std::string ARG_KEY_HEX; static const std::string ARG_VALUE_HEX; @@ -127,7 +129,12 @@ class LDBCommand { protected: LDBCommandExecuteResult exec_state_; + std::string env_uri_; std::string db_path_; + // If empty, open DB as primary. If non-empty, open the DB as secondary + // with this secondary path. When running against a database opened by + // another process, ldb wll leave the source directory completely intact. + std::string secondary_path_; std::string column_family_name_; DB* db_; DBWithTTL* db_ttl_; @@ -171,6 +178,9 @@ class LDBCommand { /** List of command-line options valid for this command */ const std::vector valid_cmd_line_options_; + /** Shared pointer to underlying environment if applicable **/ + std::shared_ptr env_guard_; + bool ParseKeyValue(const std::string& line, std::string* key, std::string* value, bool is_key_hex, bool is_value_hex); @@ -256,7 +266,8 @@ class LDBCommandRunner { public: static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name); - static void RunCommand( + // Returns the status code to return. 0 is no error. 
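A sketch of exporting a column family and importing it into another DB; the export directory and column family name are illustrative, and caller ownership of the returned metadata is an assumption:

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/checkpoint.h"

rocksdb::Status ExportAndImport(rocksdb::DB* src_db,
                                rocksdb::ColumnFamilyHandle* src_cf,
                                rocksdb::DB* dst_db,
                                rocksdb::ColumnFamilyHandle** dst_cf) {
  rocksdb::Checkpoint* checkpoint = nullptr;
  rocksdb::Status s = rocksdb::Checkpoint::Create(src_db, &checkpoint);
  if (!s.ok()) {
    return s;
  }
  rocksdb::ExportImportFilesMetaData* metadata = nullptr;
  s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);
  delete checkpoint;
  if (!s.ok()) {
    return s;
  }
  rocksdb::ImportColumnFamilyOptions import_options;
  import_options.move_files = false;  // copy the exported files
  s = dst_db->CreateColumnFamilyWithImport(rocksdb::ColumnFamilyOptions(),
                                           "imported_cf", import_options,
                                           *metadata, dst_cf);
  delete metadata;  // assumed to be owned by the caller
  return s;
}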
+ static int RunCommand( int argc, char** argv, Options options, const LDBOptions& ldb_options, const std::vector* column_families); }; diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index 86a51b92ead..d1516079a61 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -11,80 +11,195 @@ #include #include #include +#include #include - -#include "rocksdb/env.h" +#include "rocksdb/status.h" namespace rocksdb { - -// Creates a new T using the factory function that was registered with a pattern -// that matches the provided "target" string according to std::regex_match. -// -// If no registered functions match, returns nullptr. If multiple functions -// match, the factory function used is unspecified. -// -// Populates res_guard with result pointer if caller is granted ownership. -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard); - +class Logger; // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template -using FactoryFunc = std::function*)>; - -// To register a factory function for a type T, initialize a Registrar object -// with static storage duration. For example: -// -// static Registrar hdfs_reg("hdfs://.*", &CreateHdfsEnv); -// -// Then, calling NewCustomObject("hdfs://some_path", ...) will match the -// regex provided above, so it returns the result of invoking CreateHdfsEnv. -template -class Registrar { +using FactoryFunc = + std::function*, std::string*)>; + +class ObjectLibrary { public: - explicit Registrar(std::string pattern, FactoryFunc factory); -}; + // Base class for an Entry in the Registry. + class Entry { + public: + virtual ~Entry() {} + Entry(const std::string& name) : name_(std::move(name)) {} + + // Checks to see if the target matches this entry + virtual bool matches(const std::string& target) const { + return name_ == target; + } + const std::string& Name() const { return name_; } + + private: + const std::string name_; // The name of the Entry + }; // End class Entry + + // An Entry containing a FactoryFunc for creating new Objects + template + class FactoryEntry : public Entry { + public: + FactoryEntry(const std::string& name, FactoryFunc f) + : Entry(name), pattern_(std::move(name)), factory_(std::move(f)) {} + ~FactoryEntry() override {} + bool matches(const std::string& target) const override { + return std::regex_match(target, pattern_); + } + // Creates a new T object. + T* NewFactoryObject(const std::string& target, std::unique_ptr* guard, + std::string* msg) const { + return factory_(target, guard, msg); + } -// Implementation details follow. + private: + std::regex pattern_; // The pattern for this entry + FactoryFunc factory_; + }; // End class FactoryEntry + public: + // Finds the entry matching the input name and type + const Entry* FindEntry(const std::string& type, + const std::string& name) const; + void Dump(Logger* logger) const; + + // Registers the factory with the library for the pattern. + // If the pattern matches, the factory may be used to create a new object. 
+ template + const FactoryFunc& Register(const std::string& pattern, + const FactoryFunc& factory) { + std::unique_ptr entry(new FactoryEntry(pattern, factory)); + AddEntry(T::Type(), entry); + return factory; + } + // Returns the default ObjectLibrary + static std::shared_ptr& Default(); -namespace internal { + private: + // Adds the input entry to the list for the given type + void AddEntry(const std::string& type, std::unique_ptr& entry); -template -struct RegistryEntry { - std::regex pattern; - FactoryFunc factory; + // ** FactoryFunctions for this loader, organized by type + std::unordered_map>> entries_; }; -template -struct Registry { - static Registry* Get() { - static Registry instance; - return &instance; +// The ObjectRegistry is used to register objects that can be created by a +// name/pattern at run-time where the specific implementation of the object may +// not be known in advance. +class ObjectRegistry { + public: + static std::shared_ptr NewInstance(); + + ObjectRegistry(); + + void AddLibrary(const std::shared_ptr& library) { + libraries_.emplace_back(library); } - std::vector> entries; - private: - Registry() = default; -}; + // Creates a new T using the factory function that was registered with a + // pattern that matches the provided "target" string according to + // std::regex_match. + // + // If no registered functions match, returns nullptr. If multiple functions + // match, the factory function used is unspecified. + // + // Populates res_guard with result pointer if caller is granted ownership. + template + T* NewObject(const std::string& target, std::unique_ptr* guard, + std::string* errmsg) { + guard->reset(); + const auto* basic = FindEntry(T::Type(), target); + if (basic != nullptr) { + const auto* factory = + static_cast*>(basic); + return factory->NewFactoryObject(target, guard, errmsg); + } else { + *errmsg = std::string("Could not load ") + T::Type(); + return nullptr; + } + } + + // Creates a new unique T using the input factory functions. + // Returns OK if a new unique T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a unique ptr) + template + Status NewUniqueObject(const std::string& target, + std::unique_ptr* result) { + std::string errmsg; + T* ptr = NewObject(target, result, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (*result) { + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a unique ") + + T::Type() + " from unguarded one ", + target); + } + } -} // namespace internal + // Creates a new shared T using the input factory functions. 
+ // Returns OK if a new shared T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return an unguarded object + // (meaning it cannot be managed by a shared ptr) + template + Status NewSharedObject(const std::string& target, + std::shared_ptr* result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard) { + result->reset(guard.release()); + return Status::OK(); + } else { + return Status::InvalidArgument(std::string("Cannot make a shared ") + + T::Type() + " from unguarded one ", + target); + } + } -template -T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard) { - res_guard->reset(); - for (const auto& entry : internal::Registry::Get()->entries) { - if (std::regex_match(target, entry.pattern)) { - return entry.factory(target, res_guard); + // Creates a new static T using the input factory functions. + // Returns OK if a new static T was successfully created + // Returns NotFound if the type/target could not be created + // Returns InvalidArgument if the factory return a guarded object + // (meaning it is managed by a unique ptr) + template + Status NewStaticObject(const std::string& target, T** result) { + std::string errmsg; + std::unique_ptr guard; + T* ptr = NewObject(target, &guard, &errmsg); + if (ptr == nullptr) { + return Status::NotFound(errmsg, target); + } else if (guard.get()) { + return Status::InvalidArgument(std::string("Cannot make a static ") + + T::Type() + " from a guarded one ", + target); + } else { + *result = ptr; + return Status::OK(); } } - return nullptr; -} -template -Registrar::Registrar(std::string pattern, FactoryFunc factory) { - internal::Registry::Get()->entries.emplace_back(internal::RegistryEntry{ - std::regex(std::move(pattern)), std::move(factory)}); -} + // Dump the contents of the registry to the logger + void Dump(Logger* logger) const; + + private: + const ObjectLibrary::Entry* FindEntry(const std::string& type, + const std::string& name) const; + // The set of libraries to search for factories for this registry. + // The libraries are searched in reverse order (back to front) when + // searching for entries. 
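A registration sketch, assuming a registry created with NewInstance() consults the default library and that MyMergeOperator is a user-defined MergeOperator subclass (not shown here):

#include <memory>
#include <string>
#include "rocksdb/merge_operator.h"
#include "rocksdb/utilities/object_registry.h"

// Register a factory for URIs of the form "mymerge://...".
static auto my_merge_factory =
    rocksdb::ObjectLibrary::Default()->Register<rocksdb::MergeOperator>(
        "mymerge://.*",
        [](const std::string& /*uri*/,
           std::unique_ptr<rocksdb::MergeOperator>* guard,
           std::string* /*errmsg*/) {
          guard->reset(new MyMergeOperator());  // hypothetical implementation
          return guard->get();
        });

// Create a shared MergeOperator by name through a registry.
rocksdb::Status MakeMergeOperator(
    std::shared_ptr<rocksdb::MergeOperator>* result) {
  std::shared_ptr<rocksdb::ObjectRegistry> registry =
      rocksdb::ObjectRegistry::NewInstance();
  return registry->NewSharedObject<rocksdb::MergeOperator>("mymerge://default",
                                                           result);
}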
+ std::vector> libraries_; +}; } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index bc2a7bc13d9..fef9e9910e8 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -36,6 +36,10 @@ extern std::shared_ptr NewSimCache(std::shared_ptr cache, size_t sim_capacity, int num_shard_bits); +extern std::shared_ptr NewSimCache(std::shared_ptr sim_cache, + std::shared_ptr cache, + int num_shard_bits); + class SimCache : public Cache { public: SimCache() {} diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 6e98a48e591..c0cb7e31619 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -88,6 +88,17 @@ class StackableDB : public DB { return db_->Get(options, column_family, key, value); } + using DB::GetMergeOperands; + virtual Status GetMergeOperands( + const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* slice, + GetMergeOperandsOptions* get_merge_operands_options, + int* number_of_operands) override { + return db_->GetMergeOperands(options, column_family, key, slice, + get_merge_operands_options, + number_of_operands); + } + using DB::MultiGet; virtual std::vector MultiGet( const ReadOptions& options, @@ -120,8 +131,22 @@ class StackableDB : public DB { return db_->IngestExternalFiles(args); } + using DB::CreateColumnFamilyWithImport; + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const ExportImportFilesMetaData& metadata, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadata, handle); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + virtual Status VerifyChecksum(const ReadOptions& options) override { + return db_->VerifyChecksum(options); + } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, @@ -199,10 +224,11 @@ class StackableDB : public DB { } using DB::GetApproximateSizes; - virtual void GetApproximateSizes( - ColumnFamilyHandle* column_family, const Range* r, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) override { - return db_->GetApproximateSizes(column_family, r, n, sizes, include_flags); + virtual Status GetApproximateSizes(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* r, int n, + uint64_t* sizes) override { + return db_->GetApproximateSizes(options, column_family, r, n, sizes); } using DB::GetApproximateMemTableStats; @@ -245,6 +271,13 @@ class StackableDB : public DB { return db_->EnableAutoCompaction(column_family_handles); } + virtual void EnableManualCompaction() override { + return db_->EnableManualCompaction(); + } + virtual void DisableManualCompaction() override { + return db_->DisableManualCompaction(); + } + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family) override { return db_->NumberLevels(column_family); @@ -315,6 +348,16 @@ class StackableDB : public DB { db_->GetColumnFamilyMetaData(column_family, cf_meta); } + using DB::StartBlockCacheTrace; + Status StartBlockCacheTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return 
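A sketch of the new GetMergeOperands() call; the expected_max_number_of_operands field name is assumed from the GetMergeOperandsOptions struct in db.h, and the sizes are illustrative:

#include <vector>
#include "rocksdb/db.h"

// Fetch the raw merge operands for `key` without merging them.
rocksdb::Status ReadOperands(rocksdb::DB* db, const rocksdb::Slice& key,
                             std::vector<rocksdb::PinnableSlice>* operands) {
  rocksdb::GetMergeOperandsOptions options;
  options.expected_max_number_of_operands = 16;
  operands->resize(options.expected_max_number_of_operands);
  int num_operands = 0;
  rocksdb::Status s = db->GetMergeOperands(
      rocksdb::ReadOptions(), db->DefaultColumnFamily(), key, operands->data(),
      &options, &num_operands);
  if (s.ok()) {
    operands->resize(num_operands);
  }
  return s;
}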
db_->StartBlockCacheTrace(options, std::move(trace_writer)); + } + + using DB::EndBlockCacheTrace; + Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, @@ -335,6 +378,16 @@ class StackableDB : public DB { return db_->GetSortedWalFiles(files); } + virtual Status GetCurrentWalFile( + std::unique_ptr* current_log_file) override { + return db_->GetCurrentWalFile(current_log_file); + } + + virtual Status GetCreationTimeOfOldestFile( + uint64_t* creation_time) override { + return db_->GetCreationTimeOfOldestFile(creation_time); + } + virtual Status DeleteFile(std::string name) override { return db_->DeleteFile(name); } @@ -394,6 +447,12 @@ class StackableDB : public DB { return db_->DefaultColumnFamily(); } +#ifndef ROCKSDB_LITE + Status TryCatchUpWithPrimary() override { + return db_->TryCatchUpWithPrimary(); + } +#endif // ROCKSDB_LITE + protected: DB* db_; std::shared_ptr shared_db_ptr_; diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index a3f9f6303cb..44ce2801952 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -52,6 +52,10 @@ class TransactionNotifier { // -Support for using Transactions with DBWithTTL class Transaction { public: + // No copying allowed + Transaction(const Transaction&) = delete; + void operator=(const Transaction&) = delete; + virtual ~Transaction() {} // If a transaction has a snapshot set, the transaction will ensure that @@ -131,7 +135,7 @@ class Transaction { // Status::Busy() may be returned if the transaction could not guarantee // that there are no write conflicts. Status::TryAgain() may be returned // if the memtable history size is not large enough - // (See max_write_buffer_number_to_maintain). + // (See max_write_buffer_size_to_maintain). // // If this transaction was created by a TransactionDB(), Status::Expired() // may be returned if this transaction has lived for longer than @@ -243,7 +247,7 @@ class Transaction { // Status::Busy() if there is a write conflict, // Status::TimedOut() if a lock could not be acquired, // Status::TryAgain() if the memtable history size is not large enough - // (See max_write_buffer_number_to_maintain) + // (See max_write_buffer_size_to_maintain) // Status::MergeInProgress() if merge operations cannot be resolved. // or other errors if this key could not be read. virtual Status GetForUpdate(const ReadOptions& options, @@ -320,7 +324,7 @@ class Transaction { // Status::Busy() if there is a write conflict, // Status::TimedOut() if a lock could not be acquired, // Status::TryAgain() if the memtable history size is not large enough - // (See max_write_buffer_number_to_maintain) + // (See max_write_buffer_size_to_maintain) // or other errors on unexpected failures. 
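A sketch of starting a block cache trace through the DB; NewFileTraceWriter is assumed to come from rocksdb/trace_reader_writer.h, and the output path is illustrative:

#include <memory>
#include <utility>
#include "rocksdb/db.h"
#include "rocksdb/trace_reader_writer.h"

// Record block cache accesses to a file, e.g. for block_cache_trace_analyzer.
rocksdb::Status TraceBlockCache(rocksdb::DB* db, rocksdb::Env* env) {
  std::unique_ptr<rocksdb::TraceWriter> trace_writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(
      env, rocksdb::EnvOptions(), "/tmp/block_cache_trace", &trace_writer);
  if (!s.ok()) {
    return s;
  }
  rocksdb::TraceOptions trace_options;  // default sampling and filter
  s = db->StartBlockCacheTrace(trace_options, std::move(trace_writer));
  // ... run the workload, then stop tracing:
  // s = db->EndBlockCacheTrace();
  return s;
}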
virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value, const bool assume_tracked = false) = 0; @@ -522,12 +526,13 @@ class Transaction { id_ = id; } + virtual uint64_t GetLastLogNumber() const { return log_number_; } + private: friend class PessimisticTransactionDB; friend class WriteUnpreparedTxnDB; - // No copying allowed - Transaction(const Transaction&); - void operator=(const Transaction&); + friend class TransactionTest_TwoPhaseLogRollingTest_Test; + friend class TransactionTest_TwoPhaseLogRollingTest2_Test; }; } // namespace rocksdb diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 6c4346ff3e7..91a9cec2856 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -94,14 +94,32 @@ struct TransactionDBOptions { // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // If true, the TransactionDB implementation might skip concurrency control + // unless it is overridden by TransactionOptions or + // TransactionDBWriteOptimizations. This can be used in conjuction with + // DBOptions::unordered_write when the TransactionDB is used solely for write + // ordering rather than concurrency control. + bool skip_concurrency_control = false; + + // This option is only valid for write unprepared. If a write batch exceeds + // this threshold, then the transaction will implicitly flush the currently + // pending writes into the database. A value of 0 or less means no limit. + int64_t default_write_batch_flush_threshold = 0; + private: // 128 entries size_t wp_snapshot_cache_bits = static_cast(7); // 8m entry, 64MB size size_t wp_commit_cache_bits = static_cast(23); + // For testing, whether transaction name should be auto-generated or not. This + // is useful for write unprepared which requires named transactions. + bool autogenerate_name = false; + friend class WritePreparedTxnDB; + friend class WriteUnpreparedTxn; friend class WritePreparedTransactionTestBase; + friend class TransactionTestBase; friend class MySQLStyleTransactionTest; }; @@ -155,6 +173,11 @@ struct TransactionOptions { // back/commit before new transactions start. // Default: false bool skip_concurrency_control = false; + + // See TransactionDBOptions::default_write_batch_flush_threshold for + // description. If a negative value is specified, then the default value from + // TransactionDBOptions is used. + int64_t write_batch_flush_threshold = -1; }; // The per-write optimizations that do not involve transactions. 
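A sketch of the combination suggested in the skip_concurrency_control comment above: unordered_write in DBOptions plus a write-prepared TransactionDB used only for write ordering. The path is illustrative:

#include "rocksdb/options.h"
#include "rocksdb/utilities/transaction_db.h"

rocksdb::Status OpenOrderingOnlyTxnDB(rocksdb::TransactionDB** txn_db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.unordered_write = true;
  options.two_write_queues = true;
  rocksdb::TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;
  txn_db_options.skip_concurrency_control = true;
  return rocksdb::TransactionDB::Open(options, txn_db_options,
                                      "/tmp/rocksdb_txn_example", txn_db);
}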
TransactionDB @@ -278,11 +301,9 @@ class TransactionDB : public StackableDB { // To Create an TransactionDB, call Open() // The ownership of db is transferred to the base StackableDB explicit TransactionDB(DB* db) : StackableDB(db) {} - - private: // No copying allowed - TransactionDB(const TransactionDB&); - void operator=(const TransactionDB&); + TransactionDB(const TransactionDB&) = delete; + void operator=(const TransactionDB&) = delete; }; } // namespace rocksdb diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 34e6c46895c..a0b3bac99df 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -100,6 +100,8 @@ class WriteBatchWithIndex : public WriteBatchBase { size_t max_bytes = 0); ~WriteBatchWithIndex() override; + WriteBatchWithIndex(WriteBatchWithIndex&&); + WriteBatchWithIndex& operator=(WriteBatchWithIndex&&); using WriteBatchBase::Put; Status Put(ColumnFamilyHandle* column_family, const Slice& key, @@ -163,7 +165,8 @@ class WriteBatchWithIndex : public WriteBatchBase { // the write batch update finishes. The state may recover after Next() is // called. Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, - Iterator* base_iterator); + Iterator* base_iterator, + const ReadOptions* opts = nullptr); // default column family Iterator* NewIteratorWithBase(Iterator* base_iterator); diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 2e8b496819c..24a0527897e 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,8 +5,8 @@ #pragma once #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 1 -#define ROCKSDB_PATCH 1 +#define ROCKSDB_MINOR 6 +#define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. 
We'll deprecate these diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 29b660d1987..0b42b9bd577 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "rocksdb/status.h" #include "rocksdb/write_batch_base.h" @@ -60,6 +61,7 @@ struct SavePoint { class WriteBatch : public WriteBatchBase { public: explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0); + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz); ~WriteBatch() override; using WriteBatchBase::Put; @@ -269,7 +271,7 @@ class WriteBatch : public WriteBatchBase { virtual bool Continue(); protected: - friend class WriteBatch; + friend class WriteBatchInternal; virtual bool WriteAfterCommit() const { return true; } virtual bool WriteBeforePrepare() const { return false; } }; @@ -282,7 +284,7 @@ class WriteBatch : public WriteBatchBase { size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch - int Count() const; + uint32_t Count() const; // Returns true if PutCF will be called during Iterate bool HasPut() const; @@ -311,6 +313,12 @@ class WriteBatch : public WriteBatchBase { // Returns trie if MarkRollback will be called during Iterate bool HasRollback() const; + // Assign timestamp to write batch + Status AssignTimestamp(const Slice& ts); + + // Assign timestamps to write batch + Status AssignTimestamps(const std::vector& ts_list); + using WriteBatchBase::GetWriteBatch; WriteBatch* GetWriteBatch() override { return this; } @@ -361,6 +369,7 @@ class WriteBatch : public WriteBatchBase { protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ + const size_t timestamp_size_; // Intentionally copyable }; diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index dea904c187e..a6c204a633b 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -26,6 +26,10 @@ class WriteBufferManager { // the memory allocated to the cache. It can be used even if _buffer_size = 0. 
explicit WriteBufferManager(size_t _buffer_size, std::shared_ptr cache = {}); + // No copying allowed + WriteBufferManager(const WriteBufferManager&) = delete; + WriteBufferManager& operator=(const WriteBufferManager&) = delete; + ~WriteBufferManager(); bool enabled() const { return buffer_size_ != 0; } @@ -94,9 +98,5 @@ class WriteBufferManager { void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); - - // No copying allowed - WriteBufferManager(const WriteBufferManager&) = delete; - WriteBufferManager& operator=(const WriteBufferManager&) = delete; }; } // namespace rocksdb diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 360951834a7..b1f706c161e 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.4) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4") + message("Please consider switching to CMake 3.11.4 or newer") +endif() + set(JNI_NATIVE_SOURCES rocksjni/backupablejni.cc rocksjni/backupenginejni.cc @@ -11,9 +15,9 @@ set(JNI_NATIVE_SOURCES rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc rocksjni/compaction_filter_factory_jnicallback.cc - rocksjni/compaction_job_info.cc - rocksjni/compaction_job_stats.cc - rocksjni/compaction_options.cc + rocksjni/compaction_job_info.cc + rocksjni/compaction_job_stats.cc + rocksjni/compaction_options.cc rocksjni/compaction_options_fifo.cc rocksjni/compaction_options_universal.cc rocksjni/compact_range_options.cc @@ -47,6 +51,8 @@ set(JNI_NATIVE_SOURCES rocksjni/snapshot.cc rocksjni/sst_file_manager.cc rocksjni/sst_file_writerjni.cc + rocksjni/sst_file_readerjni.cc + rocksjni/sst_file_reader_iterator.cc rocksjni/statistics.cc rocksjni/statisticsjni.cc rocksjni/table.cc @@ -72,125 +78,9 @@ set(JNI_NATIVE_SOURCES rocksjni/write_buffer_manager.cc ) -set(NATIVE_JAVA_CLASSES - org.rocksdb.AbstractCompactionFilter - org.rocksdb.AbstractCompactionFilterFactory - org.rocksdb.AbstractComparator - org.rocksdb.AbstractImmutableNativeReference - org.rocksdb.AbstractNativeReference - org.rocksdb.AbstractRocksIterator - org.rocksdb.AbstractSlice - org.rocksdb.AbstractTableFilter - org.rocksdb.AbstractTraceWriter - org.rocksdb.AbstractTransactionNotifier - org.rocksdb.AbstractWalFilter - org.rocksdb.BackupableDBOptions - org.rocksdb.BackupEngine - org.rocksdb.BlockBasedTableConfig - org.rocksdb.BloomFilter - org.rocksdb.CassandraCompactionFilter - org.rocksdb.CassandraValueMergeOperator - org.rocksdb.Checkpoint - org.rocksdb.ClockCache - org.rocksdb.ColumnFamilyHandle - org.rocksdb.ColumnFamilyOptions - org.rocksdb.CompactionJobInfo - org.rocksdb.CompactionJobStats - org.rocksdb.CompactionOptions - org.rocksdb.CompactionOptionsFIFO - org.rocksdb.CompactionOptionsUniversal - org.rocksdb.CompactRangeOptions - org.rocksdb.Comparator - org.rocksdb.ComparatorOptions - org.rocksdb.CompressionOptions - org.rocksdb.DBOptions - org.rocksdb.DirectComparator - org.rocksdb.DirectSlice - org.rocksdb.Env - org.rocksdb.EnvOptions - org.rocksdb.Filter - org.rocksdb.FlushOptions - org.rocksdb.HashLinkedListMemTableConfig - org.rocksdb.HashSkipListMemTableConfig - org.rocksdb.HdfsEnv - org.rocksdb.IngestExternalFileOptions - org.rocksdb.Logger - org.rocksdb.LRUCache - org.rocksdb.MemoryUtil - org.rocksdb.MemTableConfig - org.rocksdb.NativeComparatorWrapper - org.rocksdb.NativeLibraryLoader - org.rocksdb.OptimisticTransactionDB - org.rocksdb.OptimisticTransactionOptions - org.rocksdb.Options - org.rocksdb.OptionsUtil - org.rocksdb.PersistentCache - org.rocksdb.PlainTableConfig - 
org.rocksdb.RateLimiter - org.rocksdb.ReadOptions - org.rocksdb.RemoveEmptyValueCompactionFilter - org.rocksdb.RestoreOptions - org.rocksdb.RocksCallbackObject - org.rocksdb.RocksDB - org.rocksdb.RocksEnv - org.rocksdb.RocksIterator - org.rocksdb.RocksIteratorInterface - org.rocksdb.RocksMemEnv - org.rocksdb.RocksMutableObject - org.rocksdb.RocksObject - org.rocksdb.SkipListMemTableConfig - org.rocksdb.Slice - org.rocksdb.Snapshot - org.rocksdb.SstFileManager - org.rocksdb.SstFileWriter - org.rocksdb.Statistics - org.rocksdb.StringAppendOperator - org.rocksdb.TableFormatConfig - org.rocksdb.ThreadStatus - org.rocksdb.TimedEnv - org.rocksdb.Transaction - org.rocksdb.TransactionDB - org.rocksdb.TransactionDBOptions - org.rocksdb.TransactionLogIterator - org.rocksdb.TransactionOptions - org.rocksdb.TtlDB - org.rocksdb.UInt64AddOperator - org.rocksdb.VectorMemTableConfig - org.rocksdb.WBWIRocksIterator - org.rocksdb.WriteBatch - org.rocksdb.WriteBatch.Handler - org.rocksdb.WriteBatchInterface - org.rocksdb.WriteBatchWithIndex - org.rocksdb.WriteOptions - org.rocksdb.NativeComparatorWrapperTest - org.rocksdb.RocksDBExceptionTest - org.rocksdb.SnapshotTest - org.rocksdb.WriteBatchTest - org.rocksdb.WriteBatchTestInternalHelper - org.rocksdb.WriteBufferManager -) - -include(FindJava) -include(UseJava) -include(FindJNI) - -include_directories(${JNI_INCLUDE_DIRS}) -include_directories(${PROJECT_SOURCE_DIR}/java) - -set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) -set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) -set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) -set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) -set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) -set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) -set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) -set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) - -add_jar( - rocksdbjni_classes - SOURCES - src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java +set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AbstractCompactionFilter.java + src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java @@ -306,6 +196,8 @@ add_jar( src/main/java/org/rocksdb/SstFileManager.java src/main/java/org/rocksdb/SstFileMetaData.java src/main/java/org/rocksdb/SstFileWriter.java + src/main/java/org/rocksdb/SstFileReader.java + src/main/java/org/rocksdb/SstFileReaderIterator.java src/main/java/org/rocksdb/StateType.java src/main/java/org/rocksdb/StatisticsCollectorCallback.java src/main/java/org/rocksdb/StatisticsCollector.java @@ -338,8 +230,8 @@ add_jar( src/main/java/org/rocksdb/WalProcessingOption.java src/main/java/org/rocksdb/WALRecoveryMode.java src/main/java/org/rocksdb/WBWIRocksIterator.java - src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatch.java + src/main/java/org/rocksdb/WriteBatchInterface.java src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java @@ -348,6 +240,10 @@ add_jar( src/main/java/org/rocksdb/util/Environment.java src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java src/main/java/org/rocksdb/util/SizeUnit.java + src/main/java/org/rocksdb/UInt64AddOperator.java +) + +set(JAVA_TEST_CLASSES 
src/test/java/org/rocksdb/BackupEngineTest.java src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -355,13 +251,59 @@ add_jar( src/test/java/org/rocksdb/RocksDBExceptionTest.java src/test/java/org/rocksdb/RocksMemoryResource.java src/test/java/org/rocksdb/SnapshotTest.java - src/main/java/org/rocksdb/UInt64AddOperator.java src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java - INCLUDE_JARS ${JAVA_TESTCLASSPATH} ) +include(FindJava) +include(UseJava) +find_package(JNI) + +include_directories(${JNI_INCLUDE_DIRS}) +include_directories(${PROJECT_SOURCE_DIR}/java) + +set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) +set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) +set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) +set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) +set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) +set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) +set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) +set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) + +set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) + +if(${Java_VERSION_MAJOR} VERSION_GREATER_EQUAL "10" AND ${CMAKE_VERSION} VERSION_LESS "3.11.4") + # Java 10 and newer don't have javah, but the alternative GENERATE_NATIVE_HEADERS requires CMake 3.11.4 or newer + message(FATAL_ERROR "Detected Java 10 or newer (${Java_VERSION_STRING}), to build with CMake please upgrade CMake to 3.11.4 or newer") + +elseif(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 prepare the JAR... + message("Preparing Jar for Java 7") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + ) + +else () + # Java 1.8 or newer prepare the JAR... 
+ message("Preparing Jar for JDK ${Java_VERSION_STRING}") + add_jar( + rocksdbjni_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + GENERATE_NATIVE_HEADERS rocksdbjni_headers DESTINATION ${JNI_OUTPUT_DIR} + ) + +endif() + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/java/classes) endif() @@ -424,15 +366,116 @@ if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) endif() -set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) +if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" AND ${Java_VERSION_MAJOR} STREQUAL "1")) + # Old CMake or Java 1.7 ONLY generate JNI headers, Java 1.8+ JNI is handled in add_jar step above + message("Preparing JNI headers for Java 7") + set(NATIVE_JAVA_CLASSES + org.rocksdb.AbstractCompactionFilter + org.rocksdb.AbstractCompactionFilterFactory + org.rocksdb.AbstractComparator + org.rocksdb.AbstractImmutableNativeReference + org.rocksdb.AbstractNativeReference + org.rocksdb.AbstractRocksIterator + org.rocksdb.AbstractSlice + org.rocksdb.AbstractTableFilter + org.rocksdb.AbstractTraceWriter + org.rocksdb.AbstractTransactionNotifier + org.rocksdb.AbstractWalFilter + org.rocksdb.BackupableDBOptions + org.rocksdb.BackupEngine + org.rocksdb.BlockBasedTableConfig + org.rocksdb.BloomFilter + org.rocksdb.CassandraCompactionFilter + org.rocksdb.CassandraValueMergeOperator + org.rocksdb.Checkpoint + org.rocksdb.ClockCache + org.rocksdb.ColumnFamilyHandle + org.rocksdb.ColumnFamilyOptions + org.rocksdb.CompactionJobInfo + org.rocksdb.CompactionJobStats + org.rocksdb.CompactionOptions + org.rocksdb.CompactionOptionsFIFO + org.rocksdb.CompactionOptionsUniversal + org.rocksdb.CompactRangeOptions + org.rocksdb.Comparator + org.rocksdb.ComparatorOptions + org.rocksdb.CompressionOptions + org.rocksdb.DBOptions + org.rocksdb.DirectComparator + org.rocksdb.DirectSlice + org.rocksdb.Env + org.rocksdb.EnvOptions + org.rocksdb.Filter + org.rocksdb.FlushOptions + org.rocksdb.HashLinkedListMemTableConfig + org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.HdfsEnv + org.rocksdb.IngestExternalFileOptions + org.rocksdb.Logger + org.rocksdb.LRUCache + org.rocksdb.MemoryUtil + org.rocksdb.MemTableConfig + org.rocksdb.NativeComparatorWrapper + org.rocksdb.NativeLibraryLoader + org.rocksdb.OptimisticTransactionDB + org.rocksdb.OptimisticTransactionOptions + org.rocksdb.Options + org.rocksdb.OptionsUtil + org.rocksdb.PersistentCache + org.rocksdb.PlainTableConfig + org.rocksdb.RateLimiter + org.rocksdb.ReadOptions + org.rocksdb.RemoveEmptyValueCompactionFilter + org.rocksdb.RestoreOptions + org.rocksdb.RocksCallbackObject + org.rocksdb.RocksDB + org.rocksdb.RocksEnv + org.rocksdb.RocksIterator + org.rocksdb.RocksIteratorInterface + org.rocksdb.RocksMemEnv + org.rocksdb.RocksMutableObject + org.rocksdb.RocksObject + org.rocksdb.SkipListMemTableConfig + org.rocksdb.Slice + org.rocksdb.Snapshot + org.rocksdb.SstFileManager + org.rocksdb.SstFileWriter + org.rocksdb.SstFileReader + org.rocksdb.SstFileReaderIterator + org.rocksdb.Statistics + org.rocksdb.StringAppendOperator + org.rocksdb.TableFormatConfig + org.rocksdb.ThreadStatus + org.rocksdb.TimedEnv + org.rocksdb.Transaction + org.rocksdb.TransactionDB + org.rocksdb.TransactionDBOptions + org.rocksdb.TransactionLogIterator + org.rocksdb.TransactionOptions + org.rocksdb.TtlDB + org.rocksdb.UInt64AddOperator + org.rocksdb.VectorMemTableConfig + org.rocksdb.WBWIRocksIterator + 
org.rocksdb.WriteBatch + org.rocksdb.WriteBatch.Handler + org.rocksdb.WriteBatchInterface + org.rocksdb.WriteBatchWithIndex + org.rocksdb.WriteOptions + org.rocksdb.NativeComparatorWrapperTest + org.rocksdb.RocksDBExceptionTest + org.rocksdb.SnapshotTest + org.rocksdb.WriteBatchTest + org.rocksdb.WriteBatchTestInternalHelper + org.rocksdb.WriteBufferManager + ) -file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) -create_javah( - TARGET rocksdbjni_headers - CLASSES ${NATIVE_JAVA_CLASSES} - CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} - OUTPUT_DIR ${JNI_OUTPUT_DIR} -) + create_javah( + TARGET rocksdbjni_headers + CLASSES ${NATIVE_JAVA_CLASSES} + CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} + OUTPUT_DIR ${JNI_OUTPUT_DIR} + ) +endif() if(NOT MSVC) set_property(TARGET ${ROCKSDB_STATIC_LIB} PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/java/Makefile b/java/Makefile index efc9d2b4e11..f8642c2d612 100644 --- a/java/Makefile +++ b/java/Makefile @@ -60,6 +60,8 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.Slice\ org.rocksdb.SstFileManager\ org.rocksdb.SstFileWriter\ + org.rocksdb.SstFileReader\ + org.rocksdb.SstFileReaderIterator\ org.rocksdb.Statistics\ org.rocksdb.ThreadStatus\ org.rocksdb.TimedEnv\ @@ -156,6 +158,7 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.SnapshotTest\ org.rocksdb.SstFileManagerTest\ org.rocksdb.SstFileWriterTest\ + org.rocksdb.SstFileReaderTest\ org.rocksdb.TableFilterTest\ org.rocksdb.TimedEnvTest\ org.rocksdb.TransactionTest\ @@ -229,12 +232,20 @@ javalib: java java_test javadocs java: $(AM_V_GEN)mkdir -p $(MAIN_CLASSES) +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javac $(JAVAC_ARGS) -d $(MAIN_CLASSES)\ $(MAIN_SRC)/org/rocksdb/util/*.java\ $(MAIN_SRC)/org/rocksdb/*.java +else + $(AM_V_at)javac $(JAVAC_ARGS) -h $(NATIVE_INCLUDE) -d $(MAIN_CLASSES)\ + $(MAIN_SRC)/org/rocksdb/util/*.java\ + $(MAIN_SRC)/org/rocksdb/*.java +endif $(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md $(AM_V_at)@rm -f ./HISTORY-CPP.md +ifeq ($(shell java -version 2>&1 | grep 1.7.0 > /dev/null; printf $$?), 0) $(AM_V_at)javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) +endif sample: java $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) @@ -276,11 +287,18 @@ resolve_test_deps: java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) +ifeq ($(shell java -version 2>&1|grep 1.7.0 >/dev/null; printf $$?),0) $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ $(TEST_SRC)/org/rocksdb/test/*.java\ $(TEST_SRC)/org/rocksdb/util/*.java\ $(TEST_SRC)/org/rocksdb/*.java $(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) +else + $(AM_V_at)javac $(JAVAC_ARGS) -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -h $(NATIVE_INCLUDE) -d $(TEST_CLASSES)\ + $(TEST_SRC)/org/rocksdb/test/*.java\ + $(TEST_SRC)/org/rocksdb/util/*.java\ + $(TEST_SRC)/org/rocksdb/*.java +endif test: java java_test run_test diff --git a/java/RELEASE.md b/java/RELEASE.md index cb9aaf987b4..dda19455f3f 100644 --- a/java/RELEASE.md +++ b/java/RELEASE.md @@ -1,31 +1,36 @@ ## Cross-building -RocksDB can be built as a single self contained cross-platform JAR. The cross-platform jar can be usd on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. +RocksDB can be built as a single self contained cross-platform JAR. The cross-platform jar can be used on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. 
Building a cross-platform JAR requires: - * [Vagrant](https://www.vagrantup.com/) - * [Virtualbox](https://www.virtualbox.org/) + * [Docker](https://www.docker.com/docker-community) * A Mac OSX machine that can compile RocksDB. * Java 7 set as JAVA_HOME. Once you have these items, run this make command from RocksDB's root source directory: - make jclean clean rocksdbjavastaticrelease + make jclean clean rocksdbjavastaticreleasedocker -This command will build RocksDB natively on OSX, and will then spin up two Vagrant Virtualbox Ubuntu images to build RocksDB for both 32-bit and 64-bit Linux. +This command will build RocksDB natively on OSX, and will then spin up docker containers to build RocksDB for 32-bit and 64-bit Linux with glibc, and 32-bit and 64-bit Linux with musl libc. You can find all native binaries and JARs in the java/target directory upon completion: librocksdbjni-linux32.so librocksdbjni-linux64.so + librocksdbjni-linux64-musl.so + librocksdbjni-linux32-musl.so librocksdbjni-osx.jnilib - rocksdbjni-3.5.0-javadoc.jar - rocksdbjni-3.5.0-linux32.jar - rocksdbjni-3.5.0-linux64.jar - rocksdbjni-3.5.0-osx.jar - rocksdbjni-3.5.0-sources.jar - rocksdbjni-3.5.0.jar + rocksdbjni-x.y.z-javadoc.jar + rocksdbjni-x.y.z-linux32.jar + rocksdbjni-x.y.z-linux64.jar + rocksdbjni-x.y.z-linux64-musl.jar + rocksdbjni-x.y.z-linux32-musl.jar + rocksdbjni-x.y.z-osx.jar + rocksdbjni-x.y.z-sources.jar + rocksdbjni-x.y.z.jar + +Where x.y.z is the built version number of RocksDB. ## Maven publication diff --git a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java index 67f6a5cc055..ff36c74a4c8 100644 --- a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java @@ -646,8 +646,8 @@ private void run() throws RocksDBException { currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); break; case "fillbatch": - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000)); + tasks.add( + new WriteSequentialTask(currentTaskId++, randSeed_, num_, num_, writeOpt, 1000)); break; case "fillrandom": tasks.add(new WriteRandomTask( @@ -901,27 +901,23 @@ public static void main(String[] args) throws Exception { } private enum Flag { - benchmarks( - Arrays.asList( - "fillseq", - "readrandom", - "fillrandom"), - "Comma-separated list of operations to run in the specified order\n" + - "\tActual benchmarks:\n" + - "\t\tfillseq -- write N values in sequential key order in async mode.\n" + - "\t\tfillrandom -- write N values in random key order in async mode.\n" + - "\t\tfillbatch -- write N/1000 batch where each batch has 1000 values\n" + - "\t\t in random key order in sync mode.\n" + - "\t\tfillsync -- write N/100 values in random key order in sync mode.\n" + - "\t\tfill100K -- write N/1000 100K values in random order in async mode.\n" + - "\t\treadseq -- read N times sequentially.\n" + - "\t\treadrandom -- read N times in random order.\n" + - "\t\treadhot -- read N times in random order from 1% section of DB.\n" + - "\t\treadwhilewriting -- measure the read performance of multiple readers\n" + - "\t\t with a bg single writer. 
The write rate of the bg\n" + - "\t\t is capped by --writes_per_second.\n" + - "\tMeta Operations:\n" + - "\t\tdelete -- delete DB") { + benchmarks(Arrays.asList("fillseq", "readrandom", "fillrandom"), + "Comma-separated list of operations to run in the specified order\n" + + "\tActual benchmarks:\n" + + "\t\tfillseq -- write N values in sequential key order in async mode.\n" + + "\t\tfillrandom -- write N values in random key order in async mode.\n" + + "\t\tfillbatch -- write N/1000 batch where each batch has 1000 values\n" + + "\t\t in sequential key order in sync mode.\n" + + "\t\tfillsync -- write N/100 values in random key order in sync mode.\n" + + "\t\tfill100K -- write N/1000 100K values in random order in async mode.\n" + + "\t\treadseq -- read N times sequentially.\n" + + "\t\treadrandom -- read N times in random order.\n" + + "\t\treadhot -- read N times in random order from 1% section of DB.\n" + + "\t\treadwhilewriting -- measure the read performance of multiple readers\n" + + "\t\t with a bg single writer. The write rate of the bg\n" + + "\t\t is capped by --writes_per_second.\n" + + "\tMeta Operations:\n" + + "\t\tdelete -- delete DB") { @Override public Object parseValue(String value) { return new ArrayList(Arrays.asList(value.split(","))); } diff --git a/java/crossbuild/Vagrantfile b/java/crossbuild/Vagrantfile index 4a321774888..0ee50de2ce1 100644 --- a/java/crossbuild/Vagrantfile +++ b/java/crossbuild/Vagrantfile @@ -7,11 +7,29 @@ VAGRANTFILE_API_VERSION = "2" Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.define "linux32" do |linux32| - linux32.vm.box = "hansode/centos-6.7-i386" + linux32.vm.box = "bento/centos-6.10-i386" + linux32.vm.provision :shell, path: "build-linux-centos.sh" end config.vm.define "linux64" do |linux64| - linux64.vm.box = "hansode/centos-6.7-x86_64" + linux64.vm.box = "bento/centos-6.10" + linux64.vm.provision :shell, path: "build-linux-centos.sh" + end + + config.vm.define "linux32-musl" do |musl32| + musl32.vm.box = "alpine/alpine32" + musl32.vm.box_version = "3.6.0" + musl32.vm.provision :shell, path: "build-linux-alpine.sh" + end + + config.vm.define "linux64-musl" do |musl64| + musl64.vm.box = "generic/alpine36" + + ## Should use the alpine/alpine64 box, but this issue needs to be fixed first - https://github.com/hashicorp/vagrant/issues/11218 + # musl64.vm.box = "alpine/alpine64" + # musl64.vm.box_version = "3.6.0" + + musl64.vm.provision :shell, path: "build-linux-alpine.sh" end config.vm.provider "virtualbox" do |v| @@ -20,7 +38,13 @@ Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| v.customize ["modifyvm", :id, "--nictype1", "virtio" ] end - config.vm.provision :shell, path: "build-linux-centos.sh" + if Vagrant.has_plugin?("vagrant-cachier") + config.cache.scope = :box + end + if Vagrant.has_plugin?("vagrant-vbguest") + config.vbguest.no_install = true + end + config.vm.synced_folder "../target", "/rocksdb-build" config.vm.synced_folder "../..", "/rocksdb", type: "rsync" config.vm.boot_timeout = 1200 diff --git a/java/crossbuild/build-linux-alpine.sh b/java/crossbuild/build-linux-alpine.sh new file mode 100755 index 00000000000..561d34141ea --- /dev/null +++ b/java/crossbuild/build-linux-alpine.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +set -e + +# update Alpine with latest versions +echo '@edge http://nl.alpinelinux.org/alpine/edge/main' >> /etc/apk/repositories +echo '@community http://nl.alpinelinux.org/alpine/edge/community' >> /etc/apk/repositories +apk update +apk upgrade + +# install CA certificates +apk add ca-certificates + +# install build tools +apk add \ + build-base \ + coreutils \ + file \ + git \ + perl \ + automake \ + autoconf \ + cmake + +# install tool dependencies for building RocksDB static library +apk add \ + curl \ + bash \ + wget \ + tar \ + openssl + +# install RocksDB dependencies +apk add \ + snappy snappy-dev \ + zlib zlib-dev \ + bzip2 bzip2-dev \ + lz4 lz4-dev \ + zstd zstd-dev \ + linux-headers \ + jemalloc jemalloc-dev + +# install OpenJDK7 +apk add openjdk7 \ + && apk add java-cacerts \ + && rm /usr/lib/jvm/java-1.7-openjdk/jre/lib/security/cacerts \ + && ln -s /etc/ssl/certs/java/cacerts /usr/lib/jvm/java-1.7-openjdk/jre/lib/security/cacerts + +# cleanup +rm -rf /var/cache/apk/* + +# puts javac in the PATH +export JAVA_HOME=/usr/lib/jvm/java-1.7-openjdk +export PATH=/usr/lib/jvm/java-1.7-openjdk/bin:$PATH + +# gflags from source +cd /tmp &&\ + git clone -b v2.0 --single-branch https://github.com/gflags/gflags.git &&\ + cd gflags &&\ + ./configure --prefix=/usr && make && make install &&\ + rm -rf /tmp/* + + +# build rocksdb +cd /rocksdb +make jclean clean +PORTABLE=1 make -j8 rocksdbjavastatic +cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh index a9b5e0a9270..176e3456ce9 100755 --- a/java/crossbuild/build-linux-centos.sh +++ b/java/crossbuild/build-linux-centos.sh @@ -10,7 +10,11 @@ sudo rm -f /etc/yum/vars/releasever sudo yum -y install epel-release # install all required packages for rocksdb that are available through yum -sudo yum -y install openssl java-1.7.0-openjdk-devel zlib-devel bzip2-devel lz4-devel snappy-devel libzstd-devel jemalloc-devel +sudo yum -y install openssl java-1.7.0-openjdk-devel zlib-devel bzip2-devel lz4-devel snappy-devel libzstd-devel jemalloc-devel cmake3 + +# set up cmake3 as cmake binary +sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake 10 --slave /usr/local/bin/ctest ctest /usr/bin/ctest --slave /usr/local/bin/cpack cpack /usr/bin/cpack --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake +sudo alternatives --install /usr/local/bin/cmake cmake /usr/bin/cmake3 20 --slave /usr/local/bin/ctest ctest /usr/bin/ctest3 --slave /usr/local/bin/cpack cpack /usr/bin/cpack3 --slave /usr/local/bin/ccmake ccmake /usr/bin/ccmake3 # install gcc/g++ 4.8.2 from tru/devtools-2 sudo wget -O /etc/yum.repos.d/devtools-2.repo https://people.centos.org/tru/devtools-2/devtools-2.repo @@ -24,9 +28,11 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib # set java home so we can build rocksdb jars export JAVA_HOME=/usr/lib/jvm/java-1.7.0 +export PATH=$JAVA_HOME:/usr/local/bin:$PATH + # build rocksdb cd /rocksdb -scl enable devtoolset-2 'make jclean clean' +scl enable devtoolset-2 'make clean-not-downloaded' scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build diff --git a/java/crossbuild/docker-build-linux-alpine.sh b/java/crossbuild/docker-build-linux-alpine.sh new file mode 100755 index 00000000000..e605c7716bc --- /dev/null +++ b/java/crossbuild/docker-build-linux-alpine.sh @@ 
-0,0 +1,18 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +set -e +#set -x + +# just in-case this is run outside Docker +mkdir -p /rocksdb-local-build + +rm -rf /rocksdb-local-build/* +cp -r /rocksdb-host/* /rocksdb-local-build +cd /rocksdb-local-build + +make clean-not-downloaded +PORTABLE=1 make rocksdbjavastatic + +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target + diff --git a/java/crossbuild/docker-build-linux-centos.sh b/java/crossbuild/docker-build-linux-centos.sh index aedcb8af788..c4217785f2e 100755 --- a/java/crossbuild/docker-build-linux-centos.sh +++ b/java/crossbuild/docker-build-linux-centos.sh @@ -4,26 +4,31 @@ set -e #set -x -rm -rf /rocksdb-local -cp -r /rocksdb-host /rocksdb-local -cd /rocksdb-local +# just in-case this is run outside Docker +mkdir -p /rocksdb-local-build -# Use scl devtoolset if available (i.e. CentOS <7) +rm -rf /rocksdb-local-build/* +cp -r /rocksdb-host/* /rocksdb-local-build +cd /rocksdb-local-build + +# Use scl devtoolset if available if hash scl 2>/dev/null; then if scl --list | grep -q 'devtoolset-7'; then - scl enable devtoolset-7 'make jclean clean' - scl enable devtoolset-7 'PORTABLE=1 make -j6 rocksdbjavastatic' + # CentOS 7+ + scl enable devtoolset-7 'make clean-not-downloaded' + scl enable devtoolset-7 'PORTABLE=1 make -j2 rocksdbjavastatic' elif scl --list | grep -q 'devtoolset-2'; then - scl enable devtoolset-2 'make jclean clean' - scl enable devtoolset-2 'PORTABLE=1 make -j6 rocksdbjavastatic' + # CentOS 5 or 6 + scl enable devtoolset-2 'make clean-not-downloaded' + scl enable devtoolset-2 'PORTABLE=1 make -j2 rocksdbjavastatic' else echo "Could not find devtoolset" exit 1; fi else - make jclean clean - PORTABLE=1 make -j6 rocksdbjavastatic + make clean-not-downloaded + PORTABLE=1 make -j2 rocksdbjavastatic fi -cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-host/java/target +cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-java-target diff --git a/java/rocksjni.pom b/java/rocksjni.pom index 94f07551c36..5defdca7d46 100644 --- a/java/rocksjni.pom +++ b/java/rocksjni.pom @@ -10,7 +10,7 @@ rocksdbjni - - RocksDB fat jar that contains .so files for linux32 and linux64, jnilib files + RocksDB fat jar that contains .so files for linux32 and linux64 (glibc and musl-libc), jnilib files for Mac OSX, and a .dll for Windows x64. 
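For illustration only: the docker-build-linux-alpine.sh and docker-build-linux-centos.sh helpers above are written to run inside a container with the RocksDB source mounted read-only at /rocksdb-host and the output directory mounted at /rocksdb-java-target. A hypothetical manual invocation might look like the sketch below; the image name is a placeholder and is assumed to already contain the required build toolchain and JDK (in practice these containers are driven by the rocksdbjavastaticreleasedocker Make target mentioned in java/RELEASE.md above).

    # Hypothetical invocation, not defined by this change: run the Alpine (musl) build script
    # in a container, with the source tree read-only and artifacts written to java/target.
    docker run --rm \
      -v "$PWD":/rocksdb-host:ro \
      -v "$PWD/java/target":/rocksdb-java-target \
      rocksdb-java-build-image \
      /rocksdb-host/java/crossbuild/docker-build-linux-alpine.sh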
diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc index 5e9c63643de..c4c275fb529 100644 --- a/java/rocksjni/filter.cc +++ b/java/rocksjni/filter.cc @@ -19,10 +19,10 @@ /* * Class: org_rocksdb_BloomFilter * Method: createBloomFilter - * Signature: (IZ)J + * Signature: (DZ)J */ jlong Java_org_rocksdb_BloomFilter_createNewBloomFilter( - JNIEnv* /*env*/, jclass /*jcls*/, jint bits_per_key, + JNIEnv* /*env*/, jclass /*jcls*/, jdouble bits_per_key, jboolean use_block_base_builder) { auto* sptr_filter = new std::shared_ptr( rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder)); diff --git a/java/rocksjni/loggerjnicallback.cc b/java/rocksjni/loggerjnicallback.cc index 61571e98712..a731fdac96e 100644 --- a/java/rocksjni/loggerjnicallback.cc +++ b/java/rocksjni/loggerjnicallback.cc @@ -131,7 +131,6 @@ void LoggerJniCallback::Logv(const InfoLogLevel log_level, const char* format, } assert(format != nullptr); - assert(ap != nullptr); const std::unique_ptr msg = format_str(format, ap); // pass msg to java callback handler diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 12f44b5eb09..33d42646fad 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1547,6 +1547,27 @@ jboolean Java_org_rocksdb_Options_enablePipelinedWrite( return static_cast(opt->enable_pipelined_write); } +/* + * Class: org_rocksdb_Options + * Method: setUnorderedWrite + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setUnorderedWrite( + JNIEnv*, jobject, jlong jhandle, jboolean unordered_write) { + reinterpret_cast(jhandle) + ->unordered_write = static_cast(unordered_write); +} + +/* + * Class: org_rocksdb_Options + * Method: unorderedWrite + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_unorderedWrite( + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle)->unordered_write; +} + /* * Class: org_rocksdb_Options * Method: setAllowConcurrentMemtableWrite @@ -5717,6 +5738,29 @@ jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite( return static_cast(opt->enable_pipelined_write); } +/* + * Class: org_rocksdb_DBOptions + * Method: setUnorderedWrite + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setUnorderedWrite( + JNIEnv*, jobject, jlong jhandle, jboolean junordered_write) { + auto* opt = reinterpret_cast(jhandle); + opt->unordered_write = junordered_write == JNI_TRUE; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: unorderedWrite + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_unorderedWrite( + JNIEnv*, jobject, jlong jhandle) { +auto* opt = reinterpret_cast(jhandle); +return static_cast(opt->unordered_write); +} + + /* * Class: org_rocksdb_DBOptions * Method: setEnableThreadTracking diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 193804ac318..e9dc3fb82b1 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -467,6 +467,8 @@ class StatusJni : public RocksDBNativeClass { return 0xC; case rocksdb::Status::Code::kTryAgain: return 0xD; + case rocksdb::Status::Code::kColumnFamilyDropped: + return 0xE; default: return 0x7F; // undefined } @@ -584,6 +586,12 @@ class StatusJni : public RocksDBNativeClass { new rocksdb::Status(rocksdb::Status::TryAgain( rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); break; + case 0xE: + // ColumnFamilyDropped + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::ColumnFamilyDropped( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; case 0x7F: default: return nullptr; @@ -4612,6 +4620,8 @@ class 
TickerTypeJni { return -0x0B; case rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD: return -0x0C; + case rocksdb::Tickers::TXN_GET_TRY_AGAIN: + return -0x0D; case rocksdb::Tickers::TICKER_ENUM_MAX: // 0x5F for backwards compatibility on current minor version. return 0x5F; @@ -4904,6 +4914,8 @@ class TickerTypeJni { return rocksdb::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; case -0x0C: return rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; + case -0x0D: + return rocksdb::Tickers::TXN_GET_TRY_AGAIN; case 0x5F: // 0x5F for backwards compatibility on current minor version. return rocksdb::Tickers::TICKER_ENUM_MAX; @@ -5894,8 +5906,10 @@ class IndexTypeJni { return 0x0; case rocksdb::BlockBasedTableOptions::IndexType::kHashSearch: return 0x1; - case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: + case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: return 0x2; + case rocksdb::BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey: + return 0x3; default: return 0x7F; // undefined } @@ -5912,6 +5926,9 @@ class IndexTypeJni { return rocksdb::BlockBasedTableOptions::IndexType::kHashSearch; case 0x2: return rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + case 0x3: + return rocksdb::BlockBasedTableOptions::IndexType:: + kBinarySearchWithFirstKey; default: // undefined/default return rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 53224232c83..58c06fae8ad 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -18,6 +18,7 @@ #include "include/org_rocksdb_RocksDB.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/types.h" @@ -3044,3 +3045,73 @@ void Java_org_rocksdb_RocksDB_destroyDB( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } + +bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index, + std::unique_ptr& slice, + std::vector>& ranges_to_free) { + jobject jArray = env->GetObjectArrayElement(ranges, index); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return false; + } + + if (jArray == nullptr) { + return true; + } + + jbyteArray jba = reinterpret_cast(jArray); + jsize len_ba = env->GetArrayLength(jba); + ranges_to_free.push_back(std::unique_ptr(new jbyte[len_ba])); + env->GetByteArrayRegion(jba, 0, len_ba, ranges_to_free.back().get()); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jArray); + return false; + } + env->DeleteLocalRef(jArray); + slice.reset(new rocksdb::Slice( + reinterpret_cast(ranges_to_free.back().get()), len_ba)); + return true; +} +/* + * Class: org_rocksdb_RocksDB + * Method: deleteFilesInRanges + * Signature: (JJLjava/util/List;Z)V + */ +JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_deleteFilesInRanges( + JNIEnv* env, jobject /*jdb*/, jlong jdb_handle, jlong jcf_handle, + jobjectArray ranges, jboolean include_end) { + jsize length = env->GetArrayLength(ranges); + + std::vector rangesVector; + std::vector> slices; + std::vector> ranges_to_free; + for (jsize i = 0; (i + 1) < length; i += 2) { + slices.push_back(std::unique_ptr()); + if (!get_slice_helper(env, ranges, i, slices.back(), ranges_to_free)) { + // exception thrown + return; + } + + slices.push_back(std::unique_ptr()); + if (!get_slice_helper(env, ranges, i + 1, slices.back(), ranges_to_free)) { + // exception thrown + return; + } + + 
rangesVector.push_back(rocksdb::RangePtr(slices[slices.size() - 2].get(), + slices[slices.size() - 1].get())); + } + + auto* db = reinterpret_cast(jdb_handle); + auto* column_family = + reinterpret_cast(jcf_handle); + + rocksdb::Status s = rocksdb::DeleteFilesInRanges( + db, column_family == nullptr ? db->DefaultColumnFamily() : column_family, + rangesVector.data(), rangesVector.size(), include_end); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} diff --git a/java/rocksjni/sst_file_reader_iterator.cc b/java/rocksjni/sst_file_reader_iterator.cc new file mode 100644 index 00000000000..4cbbf04bdc6 --- /dev/null +++ b/java/rocksjni/sst_file_reader_iterator.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Iterator methods from Java side. + +#include +#include +#include + +#include "include/org_rocksdb_SstFileReaderIterator.h" +#include "rocksdb/iterator.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + assert(it != nullptr); + delete it; +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: isValid0 + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_SstFileReaderIterator_isValid0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + return reinterpret_cast(handle)->Valid(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seekToFirst0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_seekToFirst0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->SeekToFirst(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seekToLast0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_seekToLast0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->SeekToLast(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: next0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_next0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->Next(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: prev0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_prev0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + reinterpret_cast(handle)->Prev(); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seek0 + * Signature: (J[BI)V + */ +void Java_org_rocksdb_SstFileReaderIterator_seek0(JNIEnv* env, jobject /*jobj*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { + jbyte* target = env->GetByteArrayElements(jtarget, nullptr); + if (target == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice target_slice(reinterpret_cast(target), jtarget_len); + + auto* it = reinterpret_cast(handle); + it->Seek(target_slice); + + env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: seekForPrev0 + * Signature: (J[BI)V + */ +void 
Java_org_rocksdb_SstFileReaderIterator_seekForPrev0(JNIEnv* env, + jobject /*jobj*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { + jbyte* target = env->GetByteArrayElements(jtarget, nullptr); + if (target == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice target_slice(reinterpret_cast(target), jtarget_len); + + auto* it = reinterpret_cast(handle); + it->SeekForPrev(target_slice); + + env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: status0 + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReaderIterator_status0(JNIEnv* env, + jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Status s = it->status(); + + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: key0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_SstFileReaderIterator_key0(JNIEnv* env, + jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Slice key_slice = it->key(); + + jbyteArray jkey = env->NewByteArray(static_cast(key_slice.size())); + if (jkey == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetByteArrayRegion( + jkey, 0, static_cast(key_slice.size()), + const_cast(reinterpret_cast(key_slice.data()))); + return jkey; +} + +/* + * Class: org_rocksdb_SstFileReaderIterator + * Method: value0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env, jobject /*jobj*/, + jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Slice value_slice = it->value(); + + jbyteArray jkeyValue = + env->NewByteArray(static_cast(value_slice.size())); + if(jkeyValue == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetByteArrayRegion(jkeyValue, 0, static_cast(value_slice.size()), + const_cast(reinterpret_cast(value_slice.data()))); + return jkeyValue; +} \ No newline at end of file diff --git a/java/rocksjni/sst_file_readerjni.cc b/java/rocksjni/sst_file_readerjni.cc new file mode 100644 index 00000000000..c8348c2e256 --- /dev/null +++ b/java/rocksjni/sst_file_readerjni.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling C++ rocksdb::SstFileReader methods +// from Java side. 
+ +#include +#include + +#include "include/org_rocksdb_SstFileReader.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/sst_file_reader.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_SstFileReader + * Method: newSstFileReader + * Signature: (J)J + */ +jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/, + jclass /*jcls*/, + jlong joptions) { + auto *options = reinterpret_cast(joptions); + rocksdb::SstFileReader *sst_file_reader = + new rocksdb::SstFileReader(*options); + return reinterpret_cast(sst_file_reader); +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: open + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jobject /*jobj*/, + jlong jhandle, jstring jfile_path) { + const char *file_path = env->GetStringUTFChars(jfile_path, nullptr); + if (file_path == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + rocksdb::Status s = + reinterpret_cast(jhandle)->Open(file_path); + env->ReleaseStringUTFChars(jfile_path, file_path); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: newIterator + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/, + jobject /*jobj*/, + jlong jhandle, + jlong jread_options_handle) { + auto *sst_file_reader = reinterpret_cast(jhandle); + auto *read_options = + reinterpret_cast(jread_options_handle); + return reinterpret_cast(sst_file_reader->NewIterator(*read_options)); +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReader_disposeInternal(JNIEnv * /*env*/, + jobject /*jobj*/, + jlong jhandle) { + delete reinterpret_cast(jhandle); +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: verifyChecksum + * Signature: (J)V + */ +void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, + jobject /*jobj*/, + jlong jhandle) { + auto *sst_file_reader = reinterpret_cast(jhandle); + auto s = sst_file_reader->VerifyChecksum(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_SstFileReader + * Method: getTableProperties + * Signature: (J)J + */ +jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv *env, + jobject /*jobj*/, + jlong jhandle) { + auto *sst_file_reader = reinterpret_cast(jhandle); + std::shared_ptr tp = + sst_file_reader->GetTableProperties(); + jobject jtable_properties = + rocksdb::TablePropertiesJni::fromCppTableProperties(env, *(tp.get())); + return jtable_properties; +} diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 1ccc550ab62..a4504d917ab 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -85,7 +85,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( std::shared_ptr *pCache = reinterpret_cast *>(jblock_cache_handle); options.block_cache = *pCache; - } else if (jblock_cache_size > 0) { + } else if (jblock_cache_size >= 0) { if (jblock_cache_num_shard_bits > 0) { options.block_cache = rocksdb::NewLRUCache( static_cast(jblock_cache_size), @@ -94,6 +94,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_cache = rocksdb::NewLRUCache( static_cast(jblock_cache_size)); } + } else { + options.no_block_cache = true; + options.block_cache = nullptr; } } if (jpersistent_cache_handle > 0) { diff --git a/java/rocksjni/write_batch.cc 
b/java/rocksjni/write_batch.cc index f1b77446c02..c6d0b9072ae 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -11,6 +11,7 @@ #include "db/write_batch_internal.h" #include "include/org_rocksdb_WriteBatch.h" #include "include/org_rocksdb_WriteBatch_Handler.h" +#include "logging/logging.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" @@ -20,7 +21,6 @@ #include "rocksjni/portal.h" #include "rocksjni/writebatchhandlerjnicallback.h" #include "table/scoped_arena_iterator.h" -#include "util/logging.h" /* * Class: org_rocksdb_WriteBatch diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 266fb4abf74..4b51b12781b 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -22,8 +22,8 @@ #include "rocksdb/write_buffer_manager.h" #include "rocksjni/portal.h" #include "table/scoped_arena_iterator.h" +#include "test_util/testharness.h" #include "util/string_util.h" -#include "util/testharness.h" /* * Class: org_rocksdb_WriteBatchTest @@ -52,9 +52,9 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, mem->Ref(); std::string state; rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); - rocksdb::Status s = - rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr); - int count = 0; + rocksdb::Status s = rocksdb::WriteBatchInternal::InsertInto( + b, &cf_mems_default, nullptr, nullptr); + unsigned int count = 0; rocksdb::Arena arena; rocksdb::ScopedArenaIterator iter( mem->NewIterator(rocksdb::ReadOptions(), &arena)); diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index ac8550f3ef7..91e3b2fa2b0 100644 --- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -13,9 +13,8 @@ * * Taken from include/rocksdb/advanced_options.h */ -public interface AdvancedColumnFamilyOptionsInterface - { - +public interface AdvancedColumnFamilyOptionsInterface< + T extends AdvancedColumnFamilyOptionsInterface> { /** * The minimum number of write buffers that will be merged together * before writing to storage. If set to 1, then diff --git a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java index 3ec46712389..03a7b098352 100644 --- a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java @@ -11,9 +11,8 @@ * Taken from include/rocksdb/advanced_options.h * and MutableCFOptions in util/cf_options.h */ -public interface AdvancedMutableColumnFamilyOptionsInterface - { - +public interface AdvancedMutableColumnFamilyOptionsInterface< + T extends AdvancedMutableColumnFamilyOptionsInterface> { /** * The maximum number of write buffers that are built up in memory. 
* The default is 2, so that when 1 write buffer is being flushed to diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 4c88a0224c6..bf5c0c1a921 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -725,7 +725,7 @@ public long blockCacheSize() { /** * Set the size of the cache in bytes that will be used by RocksDB. - * If cacheSize is non-positive, then cache will not be used. + * If cacheSize is negative, then cache will not be used. * DEFAULT: 8M * * @param blockCacheSize block cache size in bytes diff --git a/java/src/main/java/org/rocksdb/BloomFilter.java b/java/src/main/java/org/rocksdb/BloomFilter.java index 316c3ad838b..0a119878a46 100644 --- a/java/src/main/java/org/rocksdb/BloomFilter.java +++ b/java/src/main/java/org/rocksdb/BloomFilter.java @@ -20,7 +20,7 @@ */ public class BloomFilter extends Filter { - private static final int DEFAULT_BITS_PER_KEY = 10; + private static final double DEFAULT_BITS_PER_KEY = 10.0; private static final boolean DEFAULT_MODE = true; /** @@ -39,7 +39,7 @@ public BloomFilter() { * *

* bits_per_key: bits per key in bloom filter. A good value for bits_per_key - is 10, which yields a filter with ~ 1% false positive rate. + is 9.9, which yields a filter with ~ 1% false positive rate. * *
* Callers must delete the result after any database that is using the @@ -47,7 +47,7 @@ public BloomFilter() { * * @param bitsPerKey number of bits to use */ - public BloomFilter(final int bitsPerKey) { + public BloomFilter(final double bitsPerKey) { this(bitsPerKey, DEFAULT_MODE); } @@ -70,10 +70,10 @@ public BloomFilter(final int bitsPerKey) { * @param bitsPerKey number of bits to use * @param useBlockBasedMode use block based mode or full filter mode */ - public BloomFilter(final int bitsPerKey, final boolean useBlockBasedMode) { + public BloomFilter(final double bitsPerKey, final boolean useBlockBasedMode) { super(createNewBloomFilter(bitsPerKey, useBlockBasedMode)); } - private native static long createNewBloomFilter(final int bitsKeyKey, + private native static long createNewBloomFilter(final double bitsKeyKey, final boolean useBlockBasedMode); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java index f88a21af2b0..6d8dfd161c6 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -5,10 +5,8 @@ package org.rocksdb; -public interface ColumnFamilyOptionsInterface - - extends AdvancedColumnFamilyOptionsInterface { - +public interface ColumnFamilyOptionsInterface> + extends AdvancedColumnFamilyOptionsInterface { /** * Use this if your DB is very small (like under 1GB) and you don't want to * spend lots of memory for memtables. diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index e2c4c02b32e..fc413c76a68 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -872,6 +872,18 @@ public boolean enablePipelinedWrite() { return enablePipelinedWrite(nativeHandle_); } + @Override + public DBOptions setUnorderedWrite(final boolean unorderedWrite) { + setUnorderedWrite(nativeHandle_, unorderedWrite); + return this; + } + + @Override + public boolean unorderedWrite() { + return unorderedWrite(nativeHandle_); + } + + @Override public DBOptions setAllowConcurrentMemtableWrite( final boolean allowConcurrentMemtableWrite) { @@ -1266,6 +1278,9 @@ private native void setEnableThreadTracking(long handle, private native void setEnablePipelinedWrite(final long handle, final boolean enablePipelinedWrite); private native boolean enablePipelinedWrite(final long handle); + private native void setUnorderedWrite(final long handle, + final boolean unorderedWrite); + private native boolean unorderedWrite(final long handle); private native void setAllowConcurrentMemtableWrite(long handle, boolean allowConcurrentMemtableWrite); private native boolean allowConcurrentMemtableWrite(long handle); diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index af9aa179bf4..a26449c5d38 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -8,8 +8,7 @@ import java.util.Collection; import java.util.List; -public interface DBOptionsInterface { - +public interface DBOptionsInterface> { /** * Use this if your DB is very small (like under 1GB) and you don't want to * spend lots of memory for memtables. 
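For illustration only: a minimal, hypothetical sketch of using the unordered-write option whose setter is added to DBOptions above (and to Options further down in this patch), based on the standard RocksDB Java API; the database path is a placeholder. The detailed semantics and trade-offs are described in the interface javadoc in the next hunk.

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class UnorderedWriteExample {
  static {
    RocksDB.loadLibrary();
  }

  public static void main(final String[] args) throws RocksDBException {
    // Trade snapshot immutability for higher write throughput (see the javadoc below).
    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setUnorderedWrite(true);
         final RocksDB db = RocksDB.open(options, "/tmp/unordered-write-demo")) {
      db.put("key".getBytes(), "value".getBytes());
    }
  }
}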
@@ -1089,6 +1088,44 @@ T setNewTableReaderForCompactionInputs( */ boolean enablePipelinedWrite(); + /** + * Setting {@link #unorderedWrite()} to true trades the immutability guarantee of + * snapshots for higher write throughput. This violates the + * repeatability one expects from ::Get from a snapshot, as well as + * ::MultiGet and Iterator's consistent-point-in-time view property. + * If the application cannot tolerate the relaxed guarantees, it can implement + * its own mechanisms to work around that and yet benefit from the higher + * throughput. Using TransactionDB with WRITE_PREPARED write policy and + * {@link #twoWriteQueues()} true is one way to achieve immutable snapshots despite + * unordered_write. + * + * By default, i.e., when it is false, rocksdb does not advance the sequence + * number for new snapshots unless all the writes with lower sequence numbers + * are already finished. This provides the immutability that we expect from + * snapshots. Moreover, since Iterator and MultiGet internally depend on + * snapshots, the snapshot immutability results in Iterator and MultiGet + * offering a consistent-point-in-time view. If set to true, although + * Read-Your-Own-Write property is still provided, the snapshot immutability + * property is relaxed: the writes issued after the snapshot is obtained (with + * larger sequence numbers) will still not be visible to reads from that + * snapshot; however, there might still be pending writes (with lower sequence + * numbers) that will change the state visible to the snapshot after they are + * applied to the memtable. + * + * @param unorderedWrite true to enable unordered writes + * + * @return the reference to the current options. + */ + T setUnorderedWrite(final boolean unorderedWrite); + + /** + * Returns true if unordered writes are enabled. + * See {@link #setUnorderedWrite(boolean)}. + * + * @return true if unordered writes are enabled, false otherwise. + */ + boolean unorderedWrite(); + /** * If true, allow multi-writers to update mem tables in parallel. * Only some memtable factorys support concurrent writes; currently it diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java index c2efcc54b6b..be3a5d483ba 100644 --- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -5,10 +5,9 @@ package org.rocksdb; -public interface MutableColumnFamilyOptionsInterface - - extends AdvancedMutableColumnFamilyOptionsInterface { - +public interface MutableColumnFamilyOptionsInterface< + T extends MutableColumnFamilyOptionsInterface> + extends AdvancedMutableColumnFamilyOptionsInterface { /** * Amount of data to build up in memory (backed by an unsorted log * on disk) before converting to a sorted on-disk file. @@ -21,7 +20,7 @@ public interface MutableColumnFamilyOptionsInterface * Also, a larger write buffer will result in a longer recovery time * the next time the database is opened. * - * Default: 4MB + * Default: 64MB * @param writeBufferSize the size of write buffer. * @return the instance of the current object.
* @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms diff --git a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java index 1715d69d093..50347d38d53 100644 --- a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java @@ -1,8 +1,7 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. package org.rocksdb; -public interface MutableDBOptionsInterface { - +public interface MutableDBOptionsInterface> { /** * Specifies the maximum number of concurrent background jobs (both flushes * and compactions combined). diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 5831b1e298e..9fce1eda24e 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -16,7 +16,7 @@ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). * * If {@link #dispose()} function is not called, then it will be GC'd - * automaticallyand native resources will be released as part of the process. + * automatically and native resources will be released as part of the process. */ public class Options extends RocksObject implements DBOptionsInterface, @@ -919,6 +919,17 @@ public boolean enablePipelinedWrite() { return enablePipelinedWrite(nativeHandle_); } + @Override + public Options setUnorderedWrite(final boolean unorderedWrite) { + setUnorderedWrite(nativeHandle_, unorderedWrite); + return this; + } + + @Override + public boolean unorderedWrite() { + return unorderedWrite(nativeHandle_); + } + @Override public Options setAllowConcurrentMemtableWrite( final boolean allowConcurrentMemtableWrite) { @@ -1886,6 +1897,9 @@ private native void setEnableThreadTracking(long handle, private native void setEnablePipelinedWrite(final long handle, final boolean pipelinedWrite); private native boolean enablePipelinedWrite(final long handle); + private native void setUnorderedWrite(final long handle, + final boolean unorderedWrite); + private native boolean unorderedWrite(final long handle); private native void setAllowConcurrentMemtableWrite(long handle, boolean allowConcurrentMemtableWrite); private native boolean allowConcurrentMemtableWrite(long handle); diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index b93a51e28a4..0920886c40f 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -3834,6 +3834,32 @@ public void endTrace() throws RocksDBException { endTrace(nativeHandle_); } + /* + * Delete files in multiple ranges at once + * Delete files in a lot of ranges one at a time can be slow, use this API for + * better performance in that case. + * @param columnFamily - The column family for operation (null for default) + * @param includeEnd - Whether ranges should include end + * @param ranges - pairs of ranges (from1, to1, from2, to2, ...) + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public void deleteFilesInRanges(final ColumnFamilyHandle columnFamily, final List<byte[]> ranges, + final boolean includeEnd) throws RocksDBException { + if (ranges.size() == 0) { + return; + } + if ((ranges.size() % 2) != 0) { + throw new IllegalArgumentException("Ranges size needs to be multiple of 2 " + + "(from1, to1, from2, to2, ...), but is " + ranges.size()); + } + + final byte[][] rangesArray = ranges.toArray(new byte[ranges.size()][]); + + deleteFilesInRanges(nativeHandle_, columnFamily == null ? 0 : columnFamily.nativeHandle_, + rangesArray, includeEnd); + }
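A short sketch of how the new RocksDB#deleteFilesInRanges() method defined above might be invoked; the database path and the key boundaries are made-up placeholders, and passing null for the column family targets the default column family:

import java.util.Arrays;
import java.util.List;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class DeleteFilesInRangesExample {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/delete-files-in-ranges-example")) {
      // Ranges are flattened pairs: (from1, to1, from2, to2, ...).
      final List<byte[]> ranges = Arrays.asList("key000".getBytes(), "key499".getBytes());
      db.deleteFilesInRanges(null, ranges, true /* includeEnd */);
    }
  }
}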
+ /** + * Static method to destroy the contents of the specified database. + * Be very careful using this method. @@ -4171,7 +4197,8 @@ private native void promoteL0(final long handle, private native void startTrace(final long handle, final long maxTraceFileSize, final long traceWriterHandle) throws RocksDBException; private native void endTrace(final long handle) throws RocksDBException; - + private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, + boolean include_end) throws RocksDBException; private native static void destroyDB(final String path, final long optionsHandle) throws RocksDBException; diff --git a/java/src/main/java/org/rocksdb/SstFileReader.java b/java/src/main/java/org/rocksdb/SstFileReader.java new file mode 100644 index 00000000000..53f96e3cc5e --- /dev/null +++ b/java/src/main/java/org/rocksdb/SstFileReader.java @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class SstFileReader extends RocksObject { + static { + RocksDB.loadLibrary(); + } + + public SstFileReader(final Options options) { + super(newSstFileReader(options.nativeHandle_)); + } + + /** + * Returns an iterator that will iterate over all the key/value pairs in the + * SST file. + * + * Caller is responsible for closing the returned Iterator. + * + * @param readOptions Read options. + * + * @return instance of iterator object. + */ + public SstFileReaderIterator newIterator(final ReadOptions readOptions) { + assert (isOwningHandle()); + long iter = newIterator(nativeHandle_, readOptions.nativeHandle_); + return new SstFileReaderIterator(this, iter); + } + + /** + * Prepare SstFileReader to read a file. + * + * @param filePath the location of the file + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void open(final String filePath) throws RocksDBException { + open(nativeHandle_, filePath); + } + + /** + * Verify the checksum of the file. + * + * @throws RocksDBException if the checksum is not valid + */ + public void verifyChecksum() throws RocksDBException { + verifyChecksum(nativeHandle_); + } + + /** + * Get the properties of the table. + * + * @return the properties + */ + public TableProperties getTableProperties() throws RocksDBException { + return getTableProperties(nativeHandle_); + } + + @Override protected final native void disposeInternal(final long handle); + private native long newIterator(final long handle, final long readOptionsHandle); + + private native void open(final long handle, final String filePath) throws RocksDBException; + + private native static long newSstFileReader(final long optionsHandle); + private native void verifyChecksum(final long handle) throws RocksDBException; + private native TableProperties getTableProperties(final long handle) throws RocksDBException; +}
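The new SstFileReader/SstFileReaderIterator classes added by this patch can be exercised roughly as follows; this is an illustrative sketch, and the .sst path is a placeholder for a file previously produced, for example, by SstFileWriter:

import org.rocksdb.Options;
import org.rocksdb.ReadOptions;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.SstFileReader;
import org.rocksdb.SstFileReaderIterator;

public class SstFileReaderExample {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Options options = new Options();
         final SstFileReader reader = new SstFileReader(options)) {
      reader.open("/tmp/example.sst"); // an existing SST file
      reader.verifyChecksum();
      System.out.println("entries: " + reader.getTableProperties().getNumEntries());

      try (final ReadOptions readOptions = new ReadOptions();
           final SstFileReaderIterator it = reader.newIterator(readOptions)) {
        for (it.seekToFirst(); it.isValid(); it.next()) {
          System.out.println(new String(it.key()) + " => " + new String(it.value()));
        }
      }
    }
  }
}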

diff --git a/java/src/main/java/org/rocksdb/SstFileReaderIterator.java b/java/src/main/java/org/rocksdb/SstFileReaderIterator.java new file mode 100644 index 00000000000..d01b7a39031 --- /dev/null +++ b/java/src/main/java/org/rocksdb/SstFileReaderIterator.java @@ -0,0 +1,65 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * <p>An iterator that yields a sequence of key/value pairs from a source. + * Multiple implementations are provided by this library. + * In particular, iterators are provided + * to access the contents of a Table or a DB.</p> + * + * <p>Multiple threads can invoke const methods on an RocksIterator without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same RocksIterator must use + * external synchronization.</p> + * + * @see RocksObject + */ +public class SstFileReaderIterator extends AbstractRocksIterator<SstFileReader> { + protected SstFileReaderIterator(SstFileReader reader, long nativeHandle) { + super(reader, nativeHandle); + } + + /** + * <p>Return the key for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.</p> + * + * <p>REQUIRES: {@link #isValid()}</p> + * + * @return key for the current entry. + */ + public byte[] key() { + assert (isOwningHandle()); + return key0(nativeHandle_); + } + + /** + * <p>Return the value for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.</p> + * + * <p>REQUIRES: !AtEnd() && !AtStart()</p>
+ * @return value for the current entry. + */ + public byte[] value() { + assert (isOwningHandle()); + return value0(nativeHandle_); + } + + @Override protected final native void disposeInternal(final long handle); + @Override final native boolean isValid0(long handle); + @Override final native void seekToFirst0(long handle); + @Override final native void seekToLast0(long handle); + @Override final native void next0(long handle); + @Override final native void prev0(long handle); + @Override final native void seek0(long handle, byte[] target, int targetLen); + @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); + @Override final native void status0(long handle) throws RocksDBException; + + private native byte[] key0(long handle); + private native byte[] value0(long handle); +} diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 551e366dc53..40a642bd666 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -717,6 +717,11 @@ public enum TickerType { */ TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x0C), + /** + * # of times ::Get returned TryAgain due to expired snapshot seq + */ + TXN_GET_TRY_AGAIN((byte) -0x0D), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index c019266483f..b5de34b756f 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -1,9 +1,26 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. package org.rocksdb.util; +import java.io.File; +import java.io.IOException; + public class Environment { private static String OS = System.getProperty("os.name").toLowerCase(); private static String ARCH = System.getProperty("os.arch").toLowerCase(); + private static boolean MUSL_LIBC; + + static { + try { + final Process p = new ProcessBuilder("/usr/bin/env", "sh", "-c", "ldd /usr/bin/env | grep -q musl").start(); + MUSL_LIBC = p.waitFor() == 0; + } catch (final IOException | InterruptedException e) { + MUSL_LIBC = false; + } + } + + public static boolean isAarch64() { + return ARCH.contains("aarch64"); + } public static boolean isPowerPC() { return ARCH.contains("ppc"); @@ -34,6 +51,10 @@ public static boolean isUnix() { OS.contains("nux"); } + public static boolean isMuslLibc() { + return MUSL_LIBC; + } + public static boolean isSolaris() { return OS.contains("sunos"); } @@ -57,15 +78,37 @@ public static String getSharedLibraryFileName(final String name) { return appendLibOsSuffix("lib" + getSharedLibraryName(name), true); } + /** + * Get the name of the libc implementation + * + * @return the name of the implementation, + * or null if the default for that platform (e.g. glibc on Linux). + */ + public static /* @Nullable */ String getLibcName() { + if (isMuslLibc()) { + return "musl"; + } else { + return null; + } + } + + private static String getLibcPostfix() { + final String libcName = getLibcName(); + if (libcName == null) { + return ""; + } + return "-" + libcName; + } + public static String getJniLibraryName(final String name) { if (isUnix()) { final String arch = is64Bit() ? 
"64" : "32"; - if(isPowerPC()) { - return String.format("%sjni-linux-%s", name, ARCH); - } else if(isS390x()) { + if (isPowerPC() || isAarch64()) { + return String.format("%sjni-linux-%s%s", name, ARCH, getLibcPostfix()); + } else if (isS390x()) { return String.format("%sjni-linux%s", name, ARCH); } else { - return String.format("%sjni-linux%s", name, arch); + return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix()); } } else if (isMac()) { return String.format("%sjni-osx", name); diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index e6ebc46cd24..1731b6c270d 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -543,6 +543,15 @@ public void enablePipelinedWrite() { } } + @Test + public void unordredWrite() { + try(final DBOptions opt = new DBOptions()) { + assertThat(opt.unorderedWrite()).isFalse(); + opt.setUnorderedWrite(true); + assertThat(opt.unorderedWrite()).isTrue(); + } + } + @Test public void allowConcurrentMemtableWrite() { try (final DBOptions opt = new DBOptions()) { diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index e27a33d7df0..04d362fb1d2 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -762,6 +762,15 @@ public void enablePipelinedWrite() { } } + @Test + public void unordredWrite() { + try(final Options opt = new Options()) { + assertThat(opt.unorderedWrite()).isFalse(); + opt.setUnorderedWrite(true); + assertThat(opt.unorderedWrite()).isTrue(); + } + } + @Test public void allowConcurrentMemtableWrite() { try (final Options opt = new Options()) { diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index a7d7fee14f2..8af4dcaaa46 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -869,6 +869,62 @@ public void compactRangeToLevel() } } + @Test + public void deleteFilesInRange() throws RocksDBException, InterruptedException { + final int KEY_SIZE = 20; + final int VALUE_SIZE = 1000; + final int FILE_SIZE = 64000; + final int NUM_FILES = 10; + + final int KEY_INTERVAL = 10000; + /* + * Intention of these options is to end up reliably with 10 files + * we will be deleting using deleteFilesInRange. + * It is writing roughly number of keys that will fit in 10 files (target size) + * It is writing interleaved so that files from memory on L0 will overlap + * Then compaction cleans everything and we should end up with 10 files + */ + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setCompressionType(CompressionType.NO_COMPRESSION) + .setTargetFileSizeBase(FILE_SIZE) + .setWriteBufferSize(FILE_SIZE / 2) + .setDisableAutoCompactions(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + int records = FILE_SIZE / (KEY_SIZE + VALUE_SIZE); + + // fill database with key/value pairs + byte[] value = new byte[VALUE_SIZE]; + int key_init = 0; + for (int o = 0; o < NUM_FILES; ++o) { + int int_key = key_init++; + for (int i = 0; i < records; ++i) { + int_key += KEY_INTERVAL; + rand.nextBytes(value); + + db.put(String.format("%020d", int_key).getBytes(), value); + } + } + db.flush(new FlushOptions().setWaitForFlush(true)); + db.compactRange(); + // Make sure we do create one more L0 files. 
+ assertThat(db.getProperty("rocksdb.num-files-at-level0")).isEqualTo("0"); + + // Should be 10, but we are OK with asserting +- 2 + int files = Integer.parseInt(db.getProperty("rocksdb.num-files-at-level1")); + assertThat(files).isBetween(8, 12); + + // Delete lower 60% (roughly). Result should be 5, but we are OK with asserting +- 2 + // Important is that we know something was deleted (JNI call did something) + // Exact assertions are done in C++ unit tests + db.deleteFilesInRanges(null, + Arrays.asList(null, String.format("%020d", records * KEY_INTERVAL * 6 / 10).getBytes()), + false); + files = Integer.parseInt(db.getProperty("rocksdb.num-files-at-level1")); + assertThat(files).isBetween(3, 7); + } + } + @Test public void compactRangeToLevelColumnFamily() throws RocksDBException { diff --git a/java/src/test/java/org/rocksdb/SstFileReaderTest.java b/java/src/test/java/org/rocksdb/SstFileReaderTest.java new file mode 100644 index 00000000000..c0e3a73d88b --- /dev/null +++ b/java/src/test/java/org/rocksdb/SstFileReaderTest.java @@ -0,0 +1,133 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.BytewiseComparator; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +public class SstFileReaderTest { + private static final String SST_FILE_NAME = "test.sst"; + + class KeyValueWithOp { + KeyValueWithOp(String key, String value, OpType opType) { + this.key = key; + this.value = value; + this.opType = opType; + } + + String getKey() { + return key; + } + + String getValue() { + return value; + } + + OpType getOpType() { + return opType; + } + + private String key; + private String value; + private OpType opType; + } + + @Rule public TemporaryFolder parentFolder = new TemporaryFolder(); + + enum OpType { PUT, PUT_BYTES, MERGE, MERGE_BYTES, DELETE, DELETE_BYTES } + + private File newSstFile(final List keyValues) + throws IOException, RocksDBException { + final EnvOptions envOptions = new EnvOptions(); + final StringAppendOperator stringAppendOperator = new StringAppendOperator(); + final Options options = new Options().setMergeOperator(stringAppendOperator); + SstFileWriter sstFileWriter; + sstFileWriter = new SstFileWriter(envOptions, options); + + final File sstFile = parentFolder.newFile(SST_FILE_NAME); + try { + sstFileWriter.open(sstFile.getAbsolutePath()); + for (KeyValueWithOp keyValue : keyValues) { + Slice keySlice = new Slice(keyValue.getKey()); + Slice valueSlice = new Slice(keyValue.getValue()); + byte[] keyBytes = keyValue.getKey().getBytes(); + byte[] valueBytes = keyValue.getValue().getBytes(); + switch (keyValue.getOpType()) { + case PUT: + sstFileWriter.put(keySlice, valueSlice); + break; + case PUT_BYTES: + sstFileWriter.put(keyBytes, valueBytes); + break; + case MERGE: + sstFileWriter.merge(keySlice, valueSlice); + break; + case MERGE_BYTES: + sstFileWriter.merge(keyBytes, valueBytes); + break; + case DELETE: + sstFileWriter.delete(keySlice); + break; + case DELETE_BYTES: + sstFileWriter.delete(keyBytes); + break; + default: + 
fail("Unsupported op type"); + } + keySlice.close(); + valueSlice.close(); + } + sstFileWriter.finish(); + } finally { + assertThat(sstFileWriter).isNotNull(); + sstFileWriter.close(); + options.close(); + envOptions.close(); + } + return sstFile; + } + + @Test + public void readSstFile() throws RocksDBException, IOException { + final List keyValues = new ArrayList<>(); + keyValues.add(new KeyValueWithOp("key1", "value1", OpType.PUT)); + + final File sstFile = newSstFile(keyValues); + try (final StringAppendOperator stringAppendOperator = new StringAppendOperator(); + final Options options = + new Options().setCreateIfMissing(true).setMergeOperator(stringAppendOperator); + final SstFileReader reader = new SstFileReader(options)) { + // Open the sst file and iterator + reader.open(sstFile.getAbsolutePath()); + final ReadOptions readOptions = new ReadOptions(); + final SstFileReaderIterator iterator = reader.newIterator(readOptions); + + // Use the iterator to read sst file + iterator.seekToFirst(); + + // Verify Checksum + reader.verifyChecksum(); + + // Verify Table Properties + assertEquals(reader.getTableProperties().getNumEntries(), 1); + + // Check key and value + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + } + } +} diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index 28ee04768e9..ab0ff2027a0 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -16,14 +16,17 @@ public class EnvironmentTest { private final static String ARCH_FIELD_NAME = "ARCH"; private final static String OS_FIELD_NAME = "OS"; + private final static String MUSL_LIBC_FIELD_NAME = "MUSL_LIBC"; private static String INITIAL_OS; private static String INITIAL_ARCH; + private static boolean INITIAL_MUSL_LIBC; @BeforeClass public static void saveState() { INITIAL_ARCH = getEnvironmentClassField(ARCH_FIELD_NAME); INITIAL_OS = getEnvironmentClassField(OS_FIELD_NAME); + INITIAL_MUSL_LIBC = getEnvironmentClassField(MUSL_LIBC_FIELD_NAME); } @Test @@ -53,6 +56,7 @@ public void mac64() { @Test public void nix32() { // Linux + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Linux", "32"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). @@ -61,7 +65,17 @@ public void nix32() { isEqualTo("librocksdbjni-linux32.so"); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux32-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); // UNIX + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Unix", "32"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). @@ -84,6 +98,7 @@ public void aix32() { @Test public void nix64() { + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Linux", "x64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). 
@@ -92,7 +107,17 @@ public void nix64() { isEqualTo("librocksdbjni-linux64.so"); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux64-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); // UNIX + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); setEnvironmentClassFields("Unix", "x64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). @@ -130,6 +155,62 @@ public void win64() { isEqualTo("librocksdbjni.dll"); } + @Test + public void ppc64le() { + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + setEnvironmentClassFields("Linux", "ppc64le"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isPowerPC()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-ppc64le.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + setEnvironmentClassFields("Linux", "ppc64le"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isPowerPC()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-ppc64le-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + } + + @Test + public void aarch64() { + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + setEnvironmentClassFields("Linux", "aarch64"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isAarch64()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-aarch64.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + // Linux musl-libc (Alpine) + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); + setEnvironmentClassFields("Linux", "aarch64"); + assertThat(Environment.isUnix()).isTrue(); + assertThat(Environment.isAarch64()).isTrue(); + assertThat(Environment.is64Bit()).isTrue(); + assertThat(Environment.getJniLibraryExtension()).isEqualTo(".so"); + assertThat(Environment.getSharedLibraryName("rocksdb")).isEqualTo("rocksdbjni"); + 
assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-linux-aarch64-musl.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); + } + private void setEnvironmentClassFields(String osName, String osArch) { setEnvironmentClassField(OS_FIELD_NAME, osName); @@ -140,9 +221,10 @@ private void setEnvironmentClassFields(String osName, public static void restoreState() { setEnvironmentClassField(OS_FIELD_NAME, INITIAL_OS); setEnvironmentClassField(ARCH_FIELD_NAME, INITIAL_ARCH); + setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, INITIAL_MUSL_LIBC); } - private static String getEnvironmentClassField(String fieldName) { + private static T getEnvironmentClassField(String fieldName) { final Field field; try { field = Environment.class.getDeclaredField(fieldName); @@ -150,13 +232,13 @@ private static String getEnvironmentClassField(String fieldName) { final Field modifiersField = Field.class.getDeclaredField("modifiers"); modifiersField.setAccessible(true); modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL); - return (String)field.get(null); + return (T)field.get(null); } catch (NoSuchFieldException | IllegalAccessException e) { throw new RuntimeException(e); } } - private static void setEnvironmentClassField(String fieldName, String value) { + private static void setEnvironmentClassField(String fieldName, Object value) { final Field field; try { field = Environment.class.getDeclaredField(fieldName); diff --git a/util/auto_roll_logger.cc b/logging/auto_roll_logger.cc similarity index 59% rename from util/auto_roll_logger.cc rename to logging/auto_roll_logger.cc index ae6061aed43..73a02a89957 100644 --- a/util/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -3,13 +3,54 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" + +#include +#include "file/filename.h" +#include "logging/logging.h" #include "util/mutexlock.h" namespace rocksdb { #ifndef ROCKSDB_LITE // -- AutoRollLogger + +AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, + const std::string& db_log_dir, + size_t log_max_size, + size_t log_file_time_to_roll, + size_t keep_log_file_num, + const InfoLogLevel log_level) + : Logger(log_level), + dbname_(dbname), + db_log_dir_(db_log_dir), + env_(env), + status_(Status::OK()), + kMaxLogFileSize(log_max_size), + kLogFileTimeToRoll(log_file_time_to_roll), + kKeepLogFileNum(keep_log_file_num), + cached_now(static_cast(env_->NowMicros() * 1e-6)), + ctime_(cached_now), + cached_now_access_count(0), + call_NowMicros_every_N_records_(100), + mutex_() { + Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + if (s.IsNotSupported()) { + db_absolute_path_ = dbname; + } else { + status_ = s; + } + log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); + if (env_->FileExists(log_fname_).ok()) { + RollLogFile(); + } + GetExistingFiles(); + ResetLogger(); + if (status_.ok()) { + status_ = TrimOldLogFiles(); + } +} + Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); status_ = env_->NewLogger(log_fname_, &logger_); @@ -44,6 +85,58 @@ void AutoRollLogger::RollLogFile() { now++; } while (env_->FileExists(old_fname).ok()); env_->RenameFile(log_fname_, old_fname); + old_log_files_.push(old_fname); +} + +void AutoRollLogger::GetExistingFiles() { + { + // Empty the queue to avoid duplicated entries in the queue. + std::queue empty; + std::swap(old_log_files_, empty); + } + + std::string parent_dir; + std::vector info_log_files; + Status s = + GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + if (status_.ok()) { + status_ = s; + } + // We need to sort the file before enqueing it so that when we + // delete file from the front, it is the oldest file. + std::sort(info_log_files.begin(), info_log_files.end()); + + for (const std::string& f : info_log_files) { + old_log_files_.push(parent_dir + "/" + f); + } +} + +Status AutoRollLogger::TrimOldLogFiles() { + // Here we directly list info files and delete them through Env. + // The deletion isn't going through DB, so there are shortcomes: + // 1. the deletion is not rate limited by SstFileManager + // 2. there is a chance that an I/O will be issued here + // Since it's going to be complicated to pass DB object down to + // here, we take a simple approach to keep the code easier to + // maintain. + + // old_log_files_.empty() is helpful for the corner case that + // kKeepLogFileNum == 0. We can instead check kKeepLogFileNum != 0 but + // it's essentially the same thing, and checking empty before accessing + // the queue feels safer. + while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { + Status s = env_->DeleteFile(old_log_files_.front()); + // Remove the file from the tracking anyway. It's possible that + // DB cleaned up the old log file, or people cleaned it up manually. + old_log_files_.pop(); + // To make the file really go away, we should sync parent directory. + // Since there isn't any consistency issue involved here, skipping + // this part to avoid one I/O here. 
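The AutoRollLogger changes in this file make keep_log_file_num an enforced bound on the number of rolled info log files. As a hedged sketch of the user-facing effect, the option can be driven from RocksJava via the pre-existing setters shown below; the size and count values and the database path are arbitrary illustrations:

import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

public class InfoLogRetentionExample {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setMaxLogFileSize(10 * 1024 * 1024) // roll the info LOG at ~10 MB
             .setKeepLogFileNum(5);               // keep at most 5 rolled LOG files
         final RocksDB db = RocksDB.open(options, "/tmp/info-log-retention-example")) {
      // Rolled LOG files beyond the configured limit are trimmed by AutoRollLogger.
    }
  }
}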
+ if (!s.ok()) { + return s; + } + } + return Status::OK(); } std::string AutoRollLogger::ValistToString(const char* format, @@ -62,6 +155,11 @@ std::string AutoRollLogger::ValistToString(const char* format, void AutoRollLogger::LogInternal(const char* format, ...) { mutex_.AssertHeld(); + + if (!logger_) { + return; + } + va_list args; va_start(args, format); logger_->Logv(format, args); @@ -70,6 +168,9 @@ void AutoRollLogger::LogInternal(const char* format, ...) { void AutoRollLogger::Logv(const char* format, va_list ap) { assert(GetStatus().ok()); + if (!logger_) { + return; + } std::shared_ptr logger; { @@ -78,12 +179,19 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { (kMaxLogFileSize > 0 && logger_->GetLogFileSize() >= kMaxLogFileSize)) { RollLogFile(); Status s = ResetLogger(); + Status s2 = TrimOldLogFiles(); + if (!s.ok()) { // can't really log the error if creating a new LOG file failed return; } WriteHeaderInfo(); + + if (!s2.ok()) { + ROCKS_LOG_WARN(logger.get(), "Fail to trim old info log file: %s", + s2.ToString().c_str()); + } } // pin down the current logger_ instance before releasing the mutex. @@ -107,6 +215,10 @@ void AutoRollLogger::WriteHeaderInfo() { } void AutoRollLogger::LogHeader(const char* format, va_list args) { + if (!logger_) { + return; + } + // header message are to be retained in memory. Since we cannot make any // assumptions about the data contained in va_list, we will retain them as // strings @@ -153,7 +265,8 @@ Status CreateLoggerFromOptions(const std::string& dbname, if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.info_log_level); + options.log_file_time_to_roll, options.keep_log_file_num, + options.info_log_level); Status s = result->GetStatus(); if (!s.ok()) { delete result; diff --git a/util/auto_roll_logger.h b/logging/auto_roll_logger.h similarity index 83% rename from util/auto_roll_logger.h rename to logging/auto_roll_logger.h index 64fce4d63e7..45cbc2697a1 100644 --- a/util/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -8,13 +8,14 @@ #pragma once #include +#include #include +#include "file/filename.h" #include "port/port.h" #include "port/util_logger.h" -#include "util/filename.h" +#include "test_util/sync_point.h" #include "util/mutexlock.h" -#include "util/sync_point.h" namespace rocksdb { @@ -24,25 +25,8 @@ class AutoRollLogger : public Logger { public: AutoRollLogger(Env* env, const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, - const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) - : Logger(log_level), - dbname_(dbname), - db_log_dir_(db_log_dir), - env_(env), - status_(Status::OK()), - kMaxLogFileSize(log_max_size), - kLogFileTimeToRoll(log_file_time_to_roll), - cached_now(static_cast(env_->NowMicros() * 1e-6)), - ctime_(cached_now), - cached_now_access_count(0), - call_NowMicros_every_N_records_(100), - mutex_() { - env->GetAbsolutePath(dbname, &db_absolute_path_); - log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - RollLogFile(); - ResetLogger(); - } + size_t log_file_time_to_roll, size_t keep_log_file_num, + const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; void Logv(const char* format, va_list ap) override; @@ -57,6 +41,10 @@ class AutoRollLogger : public Logger { } size_t GetLogFileSize() const override { + if (!logger_) { + 
return 0; + } + std::shared_ptr logger; { MutexLock l(&mutex_); @@ -110,6 +98,11 @@ class AutoRollLogger : public Logger { bool LogExpired(); Status ResetLogger(); void RollLogFile(); + // Read all names of old log files into old_log_files_ + // If there is any error, put the error code in status_ + void GetExistingFiles(); + // Delete old log files if it excceeds the limit. + Status TrimOldLogFiles(); // Log message to logger without rolling void LogInternal(const char* format, ...); // Serialize the va_list to a string @@ -126,8 +119,14 @@ class AutoRollLogger : public Logger { Status status_; const size_t kMaxLogFileSize; const size_t kLogFileTimeToRoll; + const size_t kKeepLogFileNum; // header information std::list headers_; + // List of all existing info log files. Used for enforcing number of + // info log files. + // Full path is stored here. It consumes signifianctly more memory + // than only storing file name. Can optimize if it causes a problem. + std::queue old_log_files_; // to avoid frequent env->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; diff --git a/util/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc similarity index 75% rename from util/auto_roll_logger_test.cc rename to logging/auto_roll_logger_test.cc index ab9e0595808..dd279d62a25 100644 --- a/util/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -6,7 +6,7 @@ #ifndef ROCKSDB_LITE -#include "util/auto_roll_logger.h" +#include "logging/auto_roll_logger.h" #include #include #include @@ -17,11 +17,12 @@ #include #include #include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" -#include "util/logging.h" -#include "util/sync_point.h" -#include "util/testharness.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { namespace { @@ -41,6 +42,21 @@ class NoSleepEnv : public EnvWrapper { }; } // namespace +// In this test we only want to Log some simple log message with +// no format. LogMessage() provides such a simple interface and +// avoids the [format-security] warning which occurs when you +// call ROCKS_LOG_INFO(logger, log_message) directly. +namespace { +void LogMessage(Logger* logger, const char* message) { + ROCKS_LOG_INFO(logger, "%s", message); +} + +void LogMessage(const InfoLogLevel log_level, Logger* logger, + const char* message) { + Log(log_level, logger, "%s", message); +} +} // namespace + class AutoRollLoggerTest : public testing::Test { public: static void InitTestDb() { @@ -62,6 +78,41 @@ class AutoRollLoggerTest : public testing::Test { const std::string& log_message); void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, const std::string& log_message); + // return list of files under kTestDir that contains "LOG" + std::vector GetLogFiles() { + std::vector ret; + std::vector files; + Status s = default_env->GetChildren(kTestDir, &files); + // Should call ASSERT_OK() here but it doesn't compile. It's not + // worth the time figuring out why. + EXPECT_TRUE(s.ok()); + for (const auto& f : files) { + if (f.find("LOG") != std::string::npos) { + ret.push_back(f); + } + } + return ret; + } + + // Delete all log files under kTestDir + void CleanupLogFiles() { + for (const std::string& f : GetLogFiles()) { + ASSERT_OK(default_env->DeleteFile(kTestDir + "/" + f)); + } + } + + void RollNTimesBySize(Logger* auto_roll_logger, size_t file_num, + size_t max_log_file_size) { + // Roll the log 4 times, and it will trim to 3 files. 
+ std::string dummy_large_string; + dummy_large_string.assign(max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < file_num + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + } static const std::string kSampleMessage; static const std::string kTestDir; @@ -77,21 +128,6 @@ const std::string AutoRollLoggerTest::kLogFile( test::PerThreadDBPath("db_log_test") + "/LOG"); Env* AutoRollLoggerTest::default_env = Env::Default(); -// In this test we only want to Log some simple log message with -// no format. LogMessage() provides such a simple interface and -// avoids the [format-security] warning which occurs when you -// call ROCKS_LOG_INFO(logger, log_message) directly. -namespace { -void LogMessage(Logger* logger, const char* message) { - ROCKS_LOG_INFO(logger, "%s", message); -} - -void LogMessage(const InfoLogLevel log_level, Logger* logger, - const char* message) { - Log(log_level, logger, "%s", message); -} -} // namespace - void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message) { @@ -159,8 +195,10 @@ void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, TEST_F(AutoRollLoggerTest, RollLogFileBySize) { InitTestDb(); size_t log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, + keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); @@ -171,11 +209,12 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) { size_t time = 2; size_t log_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); // -- Test the existence of file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); RollLogFileByTimeTest(&nse, &logger, time, @@ -192,28 +231,30 @@ TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { // treated as "singed". size_t kZero = 0; size_t log_size = 1024; + size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", + log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. 
- logger = new AutoRollLogger( - Env::Default(), kTestDir, "", log_size, 0); + logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { size_t time = 2, log_max_size = 1024 * 5; + size_t keep_log_file_num = 10; InitTestDb(); NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time); + AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, + keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, @@ -269,6 +310,107 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { kSampleMessage + ":CreateLoggerFromOptions - both"); RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); + + // Set keep_log_file_num + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + auto_roll_logger = dynamic_cast(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + + CleanupLogFiles(); + } + + // Set keep_log_file_num and dbname is different from + // db_log_dir. + { + const size_t kFileNum = 3; + InitTestDb(); + options.max_log_file_size = 512; + options.log_file_time_to_roll = 2; + options.keep_log_file_num = kFileNum; + options.db_log_dir = kTestDir; + ASSERT_OK(CreateLoggerFromOptions("/dummy/db/name", options, &logger)); + auto_roll_logger = dynamic_cast(logger.get()); + + // Roll the log 4 times, and it will trim to 3 files. + std::string dummy_large_string; + dummy_large_string.assign(options.max_log_file_size, '='); + auto_roll_logger->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + for (size_t i = 0; i < kFileNum + 1; i++) { + // Log enough bytes to trigger at least one roll. + LogMessage(auto_roll_logger, dummy_large_string.c_str()); + LogMessage(auto_roll_logger, ""); + } + + std::vector files = GetLogFiles(); + ASSERT_EQ(kFileNum, files.size()); + for (const auto& f : files) { + ASSERT_TRUE(f.find("dummy") != std::string::npos); + } + + // Cleaning up those files. + CleanupLogFiles(); + } +} + +TEST_F(AutoRollLoggerTest, AutoDeleting) { + for (int attempt = 0; attempt < 2; attempt++) { + // In the first attemp, db_log_dir is not set, while in the + // second it is set. + std::string dbname = (attempt == 0) ? kTestDir : "/test/dummy/dir"; + std::string db_log_dir = (attempt == 0) ? 
"" : kTestDir; + + InitTestDb(); + const size_t kMaxFileSize = 512; + { + size_t log_num = 8; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + RollNTimesBySize(&logger, log_num, kMaxFileSize); + + ASSERT_EQ(log_num, GetLogFiles().size()); + } + // Shrink number of files + { + size_t log_num = 5; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(log_num, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + // Increase number of files again. + { + size_t log_num = 7; + AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, + log_num); + ASSERT_EQ(6, GetLogFiles().size()); + + RollNTimesBySize(&logger, 3, kMaxFileSize); + ASSERT_EQ(log_num, GetLogFiles().size()); + } + + CleanupLogFiles(); + } } TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { @@ -303,7 +445,7 @@ TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { {"AutoRollLogger::Flush:PinnedLogger", "PosixLogger::Flush:Begin2"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - flush_thread = port::Thread ([&]() { auto_roll_logger->Flush(); }); + flush_thread = port::Thread([&]() { auto_roll_logger->Flush(); }); TEST_SYNC_POINT( "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, @@ -322,7 +464,7 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) { // an extra-scope to force the AutoRollLogger to flush the log file when it // becomes out of scope. { - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -360,7 +502,7 @@ TEST_F(AutoRollLoggerTest, Close) { size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0); + AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -416,25 +558,6 @@ static std::vector GetOldFileNames(const std::string& path) { return ret; } -// Return the number of lines where a given pattern was found in the file -static size_t GetLinesCount(const std::string& fname, - const std::string& pattern) { - std::stringstream ssbuf; - std::string line; - size_t count = 0; - - std::ifstream inFile(fname.c_str()); - ssbuf << inFile.rdbuf(); - - while (getline(ssbuf, line)) { - if (line.find(pattern) != std::string::npos) { - count++; - } - } - - return count; -} - TEST_F(AutoRollLoggerTest, LogHeaderTest) { static const size_t MAX_HEADERS = 10; static const size_t LOG_MAX_SIZE = 1024 * 5; @@ -446,8 +569,9 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + /*keep_log_file_num=*/10); if (test_num == 0) { // Log some headers explicitly using Header() @@ -485,7 +609,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { // verify that the files rolled over ASSERT_NE(oldfname, newfname); // verify that the old log contains all the header logs - 
ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); + ASSERT_EQ(test::GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS); } } } @@ -511,6 +635,15 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) { delete db; } +TEST_F(AutoRollLoggerTest, FileCreateFailure) { + Options options; + options.max_log_file_size = 100 * 1024 * 1024; + options.db_log_dir = "/a/dir/does/not/exist/at/all"; + + std::shared_ptr logger; + ASSERT_NOK(CreateLoggerFromOptions("", options, &logger)); + ASSERT_TRUE(!logger); +} } // namespace rocksdb int main(int argc, char** argv) { diff --git a/logging/env_logger.h b/logging/env_logger.h new file mode 100644 index 00000000000..7e8212dd2ec --- /dev/null +++ b/logging/env_logger.h @@ -0,0 +1,165 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that uses custom Env object for logging. + +#pragma once + +#include +#include +#include +#include "port/sys_time.h" + +#include "file/writable_file_writer.h" +#include "monitoring/iostats_context_imp.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +class EnvLogger : public Logger { + public: + EnvLogger(std::unique_ptr&& writable_file, + const std::string& fname, const EnvOptions& options, Env* env, + InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(writable_file), fname, options, env), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + + ~EnvLogger() { + if (!closed_) { + closed_ = true; + CloseHelper(); + } + } + + private: + void FlushLocked() { + mutex_.AssertHeld(); + if (flush_pending_) { + flush_pending_ = false; + file_.Flush(); + } + last_flush_micros_ = env_->NowMicros(); + } + + void Flush() override { + TEST_SYNC_POINT("EnvLogger::Flush:Begin1"); + TEST_SYNC_POINT("EnvLogger::Flush:Begin2"); + + MutexLock l(&mutex_); + FlushLocked(); + } + + Status CloseImpl() override { return CloseHelper(); } + + Status CloseHelper() { + mutex_.Lock(); + const auto close_status = file_.Close(); + mutex_.Unlock(); + + if (close_status.ok()) { + return close_status; + } + return Status::IOError("Close of log file failed with error:" + + (close_status.getState() + ? std::string(close_status.getState()) + : std::string())); + } + + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + IOSTATS_TIMER_GUARD(logger_nanos); + + const uint64_t thread_id = env_->GetThreadID(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 65536; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + mutex_.Lock(); + // We will ignore any error returned by Append(). + file_.Append(Slice(base, p - base)); + flush_pending_ = true; + const uint64_t now_micros = env_->NowMicros(); + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + FlushLocked(); + } + mutex_.Unlock(); + if (base != buffer) { + delete[] base; + } + break; + } + } + + size_t GetLogFileSize() const override { + MutexLock l(&mutex_); + return file_.GetFileSize(); + } + + private: + WritableFileWriter file_; + mutable port::Mutex mutex_; // Mutex to protect the shared variables below. + const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + std::atomic flush_pending_; +}; + +} // namespace rocksdb diff --git a/logging/env_logger_test.cc b/logging/env_logger_test.cc new file mode 100644 index 00000000000..6e1af2d5590 --- /dev/null +++ b/logging/env_logger_test.cc @@ -0,0 +1,162 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "logging/env_logger.h" +#include "env/mock_env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace rocksdb { + +namespace { +// In this test we only want to Log some simple log message with +// no format. +void LogMessage(std::shared_ptr logger, const std::string& message) { + Log(logger, "%s", message.c_str()); +} + +// Helper method to write the message num_times in the given logger. 
+void WriteLogs(std::shared_ptr logger, const std::string& message, + int num_times) { + for (int ii = 0; ii < num_times; ++ii) { + LogMessage(logger, message); + } +} + +} // namespace + +class EnvLoggerTest : public testing::Test { + public: + Env* env_; + + EnvLoggerTest() : env_(Env::Default()) {} + + ~EnvLoggerTest() = default; + + std::shared_ptr CreateLogger() { + std::shared_ptr result; + assert(NewEnvLogger(kLogFile, env_, &result).ok()); + assert(result); + result->SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + return result; + } + + void DeleteLogFile() { ASSERT_OK(env_->DeleteFile(kLogFile)); } + + static const std::string kSampleMessage; + static const std::string kTestDir; + static const std::string kLogFile; +}; + +const std::string EnvLoggerTest::kSampleMessage = + "this is the message to be written to the log file!!"; +const std::string EnvLoggerTest::kLogFile = test::PerThreadDBPath("log_file"); + +TEST_F(EnvLoggerTest, EmptyLogFile) { + auto logger = CreateLogger(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Check the size of the log file. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, LogMultipleLines) { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + // Flush the logs. + logger->Flush(); + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Overwrite) { + { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + } + + // Now reopen the file again. + { + auto logger = CreateLogger(); + + // File should be empty. + uint64_t file_size; + ASSERT_EQ(env_->GetFileSize(kLogFile, &file_size), Status::OK()); + ASSERT_EQ(file_size, 0); + ASSERT_EQ(logger->GetLogFileSize(), 0); + ASSERT_EQ(logger->Close(), Status::OK()); + } + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, Close) { + auto logger = CreateLogger(); + + // Write multiple lines. + const int kNumIter = 10; + WriteLogs(logger, kSampleMessage, kNumIter); + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Validate whether the log file has 'kNumIter' number of lines. + ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), kNumIter); + DeleteLogFile(); +} + +TEST_F(EnvLoggerTest, ConcurrentLogging) { + auto logger = CreateLogger(); + + const int kNumIter = 20; + std::function cb = [&]() { + WriteLogs(logger, kSampleMessage, kNumIter); + logger->Flush(); + }; + + // Write to the logs from multiple threads. + std::vector threads; + const int kNumThreads = 5; + // Create threads. + for (int ii = 0; ii < kNumThreads; ++ii) { + threads.push_back(port::Thread(cb)); + } + + // Wait for them to complete. + for (auto& th : threads) { + th.join(); + } + + ASSERT_EQ(logger->Close(), Status::OK()); + + // Verfiy the log file. 
+ ASSERT_EQ(test::GetLinesCount(kLogFile, kSampleMessage), + kNumIter * kNumThreads); + DeleteLogFile(); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/util/event_logger.cc b/logging/event_logger.cc similarity index 91% rename from util/event_logger.cc rename to logging/event_logger.cc index b488984f350..4ae9d2d66c1 100644 --- a/util/event_logger.cc +++ b/logging/event_logger.cc @@ -3,18 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include "util/event_logger.h" +#include "logging/event_logger.h" -#include #include +#include #include #include -#include "util/logging.h" +#include "logging/logging.h" #include "util/string_util.h" namespace rocksdb { diff --git a/util/event_logger.h b/logging/event_logger.h similarity index 99% rename from util/event_logger.h rename to logging/event_logger.h index d88a6a4fe68..c3a7c30c601 100644 --- a/util/event_logger.h +++ b/logging/event_logger.h @@ -10,8 +10,8 @@ #include #include +#include "logging/log_buffer.h" #include "rocksdb/env.h" -#include "util/log_buffer.h" namespace rocksdb { diff --git a/util/event_logger_test.cc b/logging/event_logger_test.cc similarity index 94% rename from util/event_logger_test.cc rename to logging/event_logger_test.cc index 4bcf30ff5eb..cc635d42fbf 100644 --- a/util/event_logger_test.cc +++ b/logging/event_logger_test.cc @@ -5,8 +5,8 @@ #include -#include "util/event_logger.h" -#include "util/testharness.h" +#include "logging/event_logger.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/util/log_buffer.cc b/logging/log_buffer.cc similarity index 98% rename from util/log_buffer.cc rename to logging/log_buffer.cc index d09e0cb002f..74db11c66e3 100644 --- a/util/log_buffer.cc +++ b/logging/log_buffer.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/log_buffer.h" +#include "logging/log_buffer.h" #include "port/sys_time.h" #include "port/port.h" diff --git a/util/log_buffer.h b/logging/log_buffer.h similarity index 98% rename from util/log_buffer.h rename to logging/log_buffer.h index e356b93a746..16fb243117d 100644 --- a/util/log_buffer.h +++ b/logging/log_buffer.h @@ -5,11 +5,11 @@ #pragma once +#include +#include "memory/arena.h" +#include "port/sys_time.h" #include "rocksdb/env.h" -#include "util/arena.h" #include "util/autovector.h" -#include "port/sys_time.h" -#include namespace rocksdb { diff --git a/util/logging.h b/logging/logging.h similarity index 98% rename from util/logging.h rename to logging/logging.h index a4ef31bd6b5..cad90a309f1 100644 --- a/util/logging.h +++ b/logging/logging.h @@ -19,7 +19,7 @@ inline const char* RocksLogShorterFileName(const char* file) { - // 15 is the length of "util/logging.h". + // 15 is the length of "logging/logging.h". // If the name of this file changed, please change this number, too. return file + (sizeof(__FILE__) > 15 ? 
sizeof(__FILE__) - 15 : 0); } diff --git a/env/posix_logger.h b/logging/posix_logger.h similarity index 99% rename from env/posix_logger.h rename to logging/posix_logger.h index 401df6a3ffb..8406a6d8acc 100644 --- a/env/posix_logger.h +++ b/logging/posix_logger.h @@ -27,7 +27,7 @@ #include "env/io_posix.h" #include "monitoring/iostats_context_imp.h" #include "rocksdb/env.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/allocator.h b/memory/allocator.h similarity index 94% rename from util/allocator.h rename to memory/allocator.h index 505d6ba2bbf..619cd66a5fd 100644 --- a/util/allocator.h +++ b/memory/allocator.h @@ -33,6 +33,10 @@ class Allocator { class AllocTracker { public: explicit AllocTracker(WriteBufferManager* write_buffer_manager); + // No copying allowed + AllocTracker(const AllocTracker&) = delete; + void operator=(const AllocTracker&) = delete; + ~AllocTracker(); void Allocate(size_t bytes); // Call when we're finished allocating memory so we can free it from @@ -48,10 +52,6 @@ class AllocTracker { std::atomic bytes_allocated_; bool done_allocating_; bool freed_; - - // No copying allowed - AllocTracker(const AllocTracker&); - void operator=(const AllocTracker&); }; } // namespace rocksdb diff --git a/util/arena.cc b/memory/arena.cc similarity index 97% rename from util/arena.cc rename to memory/arena.cc index d7799eb266a..70c8039015b 100644 --- a/util/arena.cc +++ b/memory/arena.cc @@ -7,22 +7,16 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif +#include "memory/arena.h" #ifndef OS_WIN #include #endif #include +#include "logging/logging.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" -#include "util/logging.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/util/arena.h b/memory/arena.h similarity index 99% rename from util/arena.h rename to memory/arena.h index dc64154c857..fd97f57e1e5 100644 --- a/util/arena.h +++ b/memory/arena.h @@ -15,12 +15,12 @@ #ifndef OS_WIN #include #endif -#include -#include -#include #include #include -#include "util/allocator.h" +#include +#include +#include +#include "memory/allocator.h" #include "util/mutexlock.h" namespace rocksdb { diff --git a/util/arena_test.cc b/memory/arena_test.cc similarity index 99% rename from util/arena_test.cc rename to memory/arena_test.cc index 9dfc28ab2ea..18296d307d0 100644 --- a/util/arena_test.cc +++ b/memory/arena_test.cc @@ -7,9 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/arena.h" +#include "memory/arena.h" +#include "test_util/testharness.h" #include "util/random.h" -#include "util/testharness.h" namespace rocksdb { diff --git a/util/concurrent_arena.cc b/memory/concurrent_arena.cc similarity index 97% rename from util/concurrent_arena.cc rename to memory/concurrent_arena.cc index cef77d7e75f..722eb3b60bd 100644 --- a/util/concurrent_arena.cc +++ b/memory/concurrent_arena.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
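Note on the allocator.h hunk above (and the matching InlineSkipList and SkipList hunks further below): the "no copying allowed" members move from private, never-defined declarations to publicly deleted special members. A minimal sketch of the two idioms, using hypothetical class names:

// Old pattern: private, never-defined copy members; misuse fails only at
// link time (or when a member or friend tries to copy).
class OldStyleNoCopy {
 public:
  OldStyleNoCopy() {}
 private:
  OldStyleNoCopy(const OldStyleNoCopy&);
  void operator=(const OldStyleNoCopy&);
};

// Pattern this diff switches to: publicly deleted special members; misuse
// fails at compile time with a direct diagnostic.
class NewStyleNoCopy {
 public:
  NewStyleNoCopy() = default;
  NewStyleNoCopy(const NewStyleNoCopy&) = delete;
  void operator=(const NewStyleNoCopy&) = delete;
};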
-#include "util/concurrent_arena.h" +#include "memory/concurrent_arena.h" #include #include "port/port.h" #include "util/random.h" diff --git a/util/concurrent_arena.h b/memory/concurrent_arena.h similarity index 99% rename from util/concurrent_arena.h rename to memory/concurrent_arena.h index a6191100fd0..6b41ab02470 100644 --- a/util/concurrent_arena.h +++ b/memory/concurrent_arena.h @@ -11,9 +11,9 @@ #include #include #include +#include "memory/allocator.h" +#include "memory/arena.h" #include "port/likely.h" -#include "util/allocator.h" -#include "util/arena.h" #include "util/core_local.h" #include "util/mutexlock.h" #include "util/thread_local.h" diff --git a/util/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc similarity index 99% rename from util/jemalloc_nodump_allocator.cc rename to memory/jemalloc_nodump_allocator.cc index cdd08e932e3..1f58351bef6 100644 --- a/util/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/jemalloc_nodump_allocator.h" +#include "memory/jemalloc_nodump_allocator.h" #include #include diff --git a/util/jemalloc_nodump_allocator.h b/memory/jemalloc_nodump_allocator.h similarity index 98% rename from util/jemalloc_nodump_allocator.h rename to memory/jemalloc_nodump_allocator.h index e93c1223778..f997a3b8120 100644 --- a/util/jemalloc_nodump_allocator.h +++ b/memory/jemalloc_nodump_allocator.h @@ -11,7 +11,6 @@ #include "port/jemalloc_helper.h" #include "port/port.h" #include "rocksdb/memory_allocator.h" -#include "util/core_local.h" #include "util/thread_local.h" #if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX) diff --git a/util/memory_allocator.h b/memory/memory_allocator.h similarity index 100% rename from util/memory_allocator.h rename to memory/memory_allocator.h diff --git a/util/memory_usage.h b/memory/memory_usage.h similarity index 100% rename from util/memory_usage.h rename to memory/memory_usage.h diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index a1fa4938c52..ddd40aa059f 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
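These renames are part of the broader header reorganization in this patch: helpers move out of util/ into memory/, logging/, and test_util/. As a rough illustration only (a hypothetical downstream file, not part of this diff), code that used the old paths updates its includes like so:

#include "memory/arena.h"          // was "util/arena.h"
#include "memory/allocator.h"      // was "util/allocator.h"
#include "logging/logging.h"       // was "util/logging.h"
#include "test_util/sync_point.h"  // was "util/sync_point.h"
#include "test_util/testharness.h" // was "util/testharness.h"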
#include +#include "memory/allocator.h" +#include "memory/arena.h" #include "rocksdb/write_buffer_manager.h" -#include "util/allocator.h" -#include "util/arena.h" namespace rocksdb { diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index 878d2338356..5c906165e0a 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -10,13 +10,13 @@ #include #include #include "db/memtable.h" +#include "memory/arena.h" #include "memtable/skiplist.h" #include "monitoring/histogram.h" #include "port/port.h" #include "rocksdb/memtablerep.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "util/arena.h" #include "util/hash.h" namespace rocksdb { @@ -218,8 +218,7 @@ class HashLinkListRep : public MemTableRep { } size_t GetHash(const Slice& slice) const { - return NPHash64(slice.data(), static_cast(slice.size()), 0) % - bucket_size_; + return fastrange64(GetSliceNPHash64(slice), bucket_size_); } Pointer* GetBucket(size_t i) const { diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index d02919cd4ef..5c74657cd31 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -9,14 +9,14 @@ #include +#include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/skiplist.h" +#include "port/port.h" #include "rocksdb/memtablerep.h" -#include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "port/port.h" #include "util/murmurhash.h" -#include "db/memtable.h" -#include "memtable/skiplist.h" namespace rocksdb { namespace { diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 1ef8f2b6dbc..91ab3d75460 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -46,10 +46,10 @@ #include #include #include +#include "memory/allocator.h" #include "port/likely.h" #include "port/port.h" #include "rocksdb/slice.h" -#include "util/allocator.h" #include "util/coding.h" #include "util/random.h" @@ -74,6 +74,9 @@ class InlineSkipList { explicit InlineSkipList(Comparator cmp, Allocator* allocator, int32_t max_height = 12, int32_t branching_factor = 4); + // No copying allowed + InlineSkipList(const InlineSkipList&) = delete; + InlineSkipList& operator=(const InlineSkipList&) = delete; // Allocates a key and a skip-list node, returning a pointer to the key // portion of the node. This method is thread-safe if the allocator @@ -83,6 +86,9 @@ class InlineSkipList { // Allocate a splice using allocator. Splice* AllocateSplice(); + // Allocate a splice on heap. + Splice* AllocateSpliceOnHeap(); + // Inserts a key allocated by AllocateKey, after the actual key value // has been filled in. // @@ -102,6 +108,12 @@ class InlineSkipList { // REQUIRES: no concurrent calls to any of inserts. bool InsertWithHint(const char* key, void** hint); + // Like InsertConcurrently, but with a hint + // + // REQUIRES: nothing that compares equal to key is currently in the list. + // REQUIRES: no concurrent calls that use same hint + bool InsertWithHintConcurrently(const char* key, void** hint); + // Like Insert, but external synchronization is not required. bool InsertConcurrently(const char* key); @@ -254,10 +266,6 @@ class InlineSkipList { // lowest_level (inclusive). 
void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level); - - // No copying allowed - InlineSkipList(const InlineSkipList&); - InlineSkipList& operator=(const InlineSkipList&); }; // Implementation details follow @@ -643,6 +651,18 @@ InlineSkipList::AllocateSplice() { return splice; } +template +typename InlineSkipList::Splice* +InlineSkipList::AllocateSpliceOnHeap() { + size_t array_size = sizeof(Node*) * (kMaxHeight_ + 1); + char* raw = new char[sizeof(Splice) + array_size * 2]; + Splice* splice = reinterpret_cast(raw); + splice->height_ = 0; + splice->prev_ = reinterpret_cast(raw + sizeof(Splice)); + splice->next_ = reinterpret_cast(raw + sizeof(Splice) + array_size); + return splice; +} + template bool InlineSkipList::Insert(const char* key) { return Insert(key, seq_splice_, false); @@ -669,6 +689,18 @@ bool InlineSkipList::InsertWithHint(const char* key, void** hint) { return Insert(key, splice, true); } +template +bool InlineSkipList::InsertWithHintConcurrently(const char* key, + void** hint) { + assert(hint != nullptr); + Splice* splice = reinterpret_cast(*hint); + if (splice == nullptr) { + splice = AllocateSpliceOnHeap(); + *hint = reinterpret_cast(splice); + } + return Insert(key, splice, true); +} + template template void InlineSkipList::FindSpliceForLevel(const DecodedKey& key, diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index b416ef7c557..a3ae4149877 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -10,11 +10,11 @@ #include "memtable/inlineskiplist.h" #include #include +#include "memory/concurrent_arena.h" #include "rocksdb/env.h" -#include "util/concurrent_arena.h" +#include "test_util/testharness.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" namespace rocksdb { @@ -412,12 +412,18 @@ class ConcurrentTest { } // REQUIRES: No concurrent calls for the same k - void ConcurrentWriteStep(uint32_t k) { + void ConcurrentWriteStep(uint32_t k, bool use_hint = false) { const int g = current_.Get(k) + 1; const Key new_key = MakeKey(k, g); char* buf = list_.AllocateKey(sizeof(Key)); memcpy(buf, &new_key, sizeof(Key)); - list_.InsertConcurrently(buf); + if (use_hint) { + void* hint = nullptr; + list_.InsertWithHintConcurrently(buf, &hint); + delete[] reinterpret_cast(hint); + } else { + list_.InsertConcurrently(buf); + } ASSERT_EQ(g, current_.Get(k) + 1); current_.Set(k, g); } @@ -508,6 +514,7 @@ TEST_F(InlineSkipTest, ConcurrentInsertWithoutThreads) { class TestState { public: ConcurrentTest t_; + bool use_hint_; int seed_; std::atomic quit_flag_; std::atomic next_writer_; @@ -575,7 +582,7 @@ static void ConcurrentReader(void* arg) { static void ConcurrentWriter(void* arg) { TestState* state = reinterpret_cast(arg); uint32_t k = state->next_writer_++ % ConcurrentTest::K; - state->t_.ConcurrentWriteStep(k); + state->t_.ConcurrentWriteStep(k, state->use_hint_); state->AdjustPendingWriters(-1); } @@ -600,7 +607,8 @@ static void RunConcurrentRead(int run) { } } -static void RunConcurrentInsert(int run, int write_parallelism = 4) { +static void RunConcurrentInsert(int run, bool use_hint = false, + int write_parallelism = 4) { Env::Default()->SetBackgroundThreads(1 + write_parallelism, Env::Priority::LOW); const int seed = test::RandomSeed() + (run * 100); @@ -612,6 +620,7 @@ static void RunConcurrentInsert(int run, int write_parallelism = 4) { fprintf(stderr, "Run %d of %d\n", i, N); } TestState state(seed + 1); + state.use_hint_ = use_hint; 
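The new InlineSkipList::InsertWithHintConcurrently introduced above allocates its splice on the heap (AllocateSpliceOnHeap uses new char[]) instead of the arena, so the caller owns the hint and frees it, as the test does. A minimal single-writer sketch under that assumption; the template parameters and key source are hypothetical stand-ins for the test fixture's types:

#include <cstring>
#include <vector>

// One writer thread inserting a batch of fixed-size keys, reusing one hint.
// REQUIRES (per the header comment above): no concurrent calls share a hint.
template <typename InlineSkipListType, typename Key>
void InsertBatchWithHint(InlineSkipListType& list, const std::vector<Key>& keys) {
  void* hint = nullptr;
  for (const Key& k : keys) {
    char* buf = list.AllocateKey(sizeof(Key));
    memcpy(buf, &k, sizeof(Key));
    list.InsertWithHintConcurrently(buf, &hint);  // reuses the same heap splice
  }
  // The splice was allocated with new char[], so the caller releases it.
  delete[] reinterpret_cast<char*>(hint);
}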
Env::Default()->Schedule(ConcurrentReader, &state); state.Wait(TestState::RUNNING); for (int k = 0; k < kSize; k += write_parallelism) { @@ -635,6 +644,15 @@ TEST_F(InlineSkipTest, ConcurrentRead5) { RunConcurrentRead(5); } TEST_F(InlineSkipTest, ConcurrentInsert1) { RunConcurrentInsert(1); } TEST_F(InlineSkipTest, ConcurrentInsert2) { RunConcurrentInsert(2); } TEST_F(InlineSkipTest, ConcurrentInsert3) { RunConcurrentInsert(3); } +TEST_F(InlineSkipTest, ConcurrentInsertWithHint1) { + RunConcurrentInsert(1, true); +} +TEST_F(InlineSkipTest, ConcurrentInsertWithHint2) { + RunConcurrentInsert(2, true); +} +TEST_F(InlineSkipTest, ConcurrentInsertWithHint3) { + RunConcurrentInsert(3, true); +} #endif // ROCKSDB_VALGRIND_RUN } // namespace rocksdb diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 51ff11a015c..1e2b5bdd1e5 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -7,10 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #ifndef GFLAGS #include int main() { @@ -28,6 +24,7 @@ int main() { #include "db/dbformat.h" #include "db/memtable.h" +#include "memory/arena.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" @@ -35,11 +32,10 @@ int main() { #include "rocksdb/options.h" #include "rocksdb/slice_transform.h" #include "rocksdb/write_buffer_manager.h" -#include "util/arena.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/mutexlock.h" #include "util/stop_watch.h" -#include "util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::RegisterFlagValidator; diff --git a/memtable/skiplist.h b/memtable/skiplist.h index 47a89034eb9..5edfc10b7cb 100644 --- a/memtable/skiplist.h +++ b/memtable/skiplist.h @@ -32,10 +32,10 @@ #pragma once #include -#include #include +#include +#include "memory/allocator.h" #include "port/port.h" -#include "util/allocator.h" #include "util/random.h" namespace rocksdb { @@ -51,6 +51,9 @@ class SkipList { // allocator must remain allocated for the lifetime of the skiplist object. explicit SkipList(Comparator cmp, Allocator* allocator, int32_t max_height = 12, int32_t branching_factor = 4); + // No copying allowed + SkipList(const SkipList&) = delete; + void operator=(const SkipList&) = delete; // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. @@ -158,10 +161,6 @@ class SkipList { // Return the last node in the list. // Return head_ if list is empty. 
Node* FindLast() const; - - // No copying allowed - SkipList(const SkipList&); - void operator=(const SkipList&); }; // Implementation details follow diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 50c3588bb86..33cc19b2d38 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -9,11 +9,11 @@ #include "memtable/skiplist.h" #include +#include "memory/arena.h" #include "rocksdb/env.h" -#include "util/arena.h" +#include "test_util/testharness.h" #include "util/hash.h" #include "util/random.h" -#include "util/testharness.h" namespace rocksdb { diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 32870b127d2..55d3cd7a658 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include "memtable/inlineskiplist.h" #include "db/memtable.h" +#include "memory/arena.h" +#include "memtable/inlineskiplist.h" #include "rocksdb/memtablerep.h" -#include "util/arena.h" namespace rocksdb { namespace { @@ -50,6 +50,15 @@ class SkipListRep : public MemTableRep { return skip_list_.InsertWithHint(static_cast(handle), hint); } + void InsertWithHintConcurrently(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHintConcurrently(static_cast(handle), hint); + } + + bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHintConcurrently(static_cast(handle), + hint); + } + void InsertConcurrently(KeyHandle handle) override { skip_list_.InsertConcurrently(static_cast(handle)); } diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc index 827ab8a5d2b..e7acc94ad67 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -12,8 +12,8 @@ #include #include -#include "util/arena.h" #include "db/memtable.h" +#include "memory/arena.h" #include "memtable/stl_wrappers.h" #include "port/port.h" #include "util/mutexlock.h" diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 3c89c8095e1..23de06a623a 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { @@ -51,8 +51,12 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { } TEST_F(WriteBufferManagerTest, CacheCost) { + LRUCacheOptions co; // 1GB cache - std::shared_ptr cache = NewLRUCache(1024 * 1024 * 1024, 4); + co.capacity = 1024 * 1024 * 1024; + co.num_shard_bits = 4; + co.metadata_charge_policy = kDontChargeCacheMetadata; + std::shared_ptr cache = NewLRUCache(co); // A write buffer manager of size 50MB std::unique_ptr wbf( new WriteBufferManager(50 * 1024 * 1024, cache)); diff --git a/monitoring/histogram.cc b/monitoring/histogram.cc index 4bc7139d304..4449ade6408 100644 --- a/monitoring/histogram.cc +++ b/monitoring/histogram.cc @@ -7,16 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
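The write_buffer_manager_test change above switches from the NewLRUCache(capacity, num_shard_bits) overload to the LRUCacheOptions form so the test can pin metadata_charge_policy to kDontChargeCacheMetadata and keep its cache-cost accounting stable. A small sketch of the same construction (the values are simply the test's, not a recommendation):

#include <memory>
#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeTestCache() {
  rocksdb::LRUCacheOptions co;
  co.capacity = 1024 * 1024 * 1024;  // 1GB
  co.num_shard_bits = 4;
  co.metadata_charge_policy = rocksdb::kDontChargeCacheMetadata;
  return rocksdb::NewLRUCache(co);
}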
-#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include "monitoring/histogram.h" -#include -#include #include #include +#include +#include #include "port/port.h" #include "util/cast_util.h" diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index df58822fc21..ed9a7bd32ff 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -7,7 +7,7 @@ #include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/db/in_memory_stats_history.cc b/monitoring/in_memory_stats_history.cc similarity index 82% rename from db/in_memory_stats_history.cc rename to monitoring/in_memory_stats_history.cc index 39355cfbe0a..22ecde0ab6c 100644 --- a/db/in_memory_stats_history.cc +++ b/monitoring/in_memory_stats_history.cc @@ -6,8 +6,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl.h" -#include "db/in_memory_stats_history.h" +#include "monitoring/in_memory_stats_history.h" +#include "db/db_impl/db_impl.h" namespace rocksdb { @@ -17,6 +17,10 @@ bool InMemoryStatsHistoryIterator::Valid() const { return valid_; } Status InMemoryStatsHistoryIterator::status() const { return status_; } +// Because of garbage collection, the next stats snapshot may or may not be +// right after the current one. When reading from DBImpl::stats_history_, this +// call will be protected by DB Mutex so it will not return partial or +// corrupted results. void InMemoryStatsHistoryIterator::Next() { // increment start_time by 1 to avoid infinite loop AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); diff --git a/db/in_memory_stats_history.h b/monitoring/in_memory_stats_history.h similarity index 63% rename from db/in_memory_stats_history.h rename to monitoring/in_memory_stats_history.h index 4b52e23fffa..37b50ca06de 100644 --- a/db/in_memory_stats_history.h +++ b/monitoring/in_memory_stats_history.h @@ -12,8 +12,20 @@ namespace rocksdb { +// InMemoryStatsHistoryIterator can be used to access stats history that was +// stored by an in-memory two level std::map(DBImpl::stats_history_). It keeps +// a copy of the stats snapshot (in stats_map_) that is currently being pointed +// to, which allows the iterator to access the stats snapshot even when +// the background garbage collecting thread purges it from the source of truth +// (`DBImpl::stats_history_`). In that case, the iterator will continue to be +// valid until a call to `Next()` returns no result and invalidates it. In +// some extreme cases, the iterator may also return fragmented segments of +// stats snapshots due to long gaps between `Next()` calls and interleaved +// garbage collection. 
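The comment block above describes how the in-memory stats history iterator behaves around garbage collection; the tests added later in this patch drive it through DB::GetStatsHistory. A minimal consumer sketch (error handling kept to the essentials):

#include <cinttypes>
#include <cstdio>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/stats_history.h"

void DumpStatsHistory(rocksdb::DB* db, uint64_t start_time, uint64_t end_time) {
  std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  rocksdb::Status s = db->GetStatsHistory(start_time, end_time, &it);
  if (!s.ok() || it == nullptr) {
    return;
  }
  for (; it->Valid(); it->Next()) {
    const uint64_t when = it->GetStatsTime();  // seconds timestamp of the snapshot
    for (const auto& stat : it->GetStatsMap()) {
      fprintf(stdout, "%" PRIu64 " %s = %" PRIu64 "\n", when,
              stat.first.c_str(), stat.second);
    }
  }
}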
class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { public: + // Setup InMemoryStatsHistoryIterator to return stats snapshots between + // seconds timestamps [start_time, end_time) InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time, DBImpl* db_impl) : start_time_(start_time), @@ -22,13 +34,27 @@ class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { db_impl_(db_impl) { AdvanceIteratorByTime(start_time_, end_time_); } + // no copying allowed + InMemoryStatsHistoryIterator(const InMemoryStatsHistoryIterator&) = delete; + void operator=(const InMemoryStatsHistoryIterator&) = delete; + InMemoryStatsHistoryIterator(InMemoryStatsHistoryIterator&&) = delete; + InMemoryStatsHistoryIterator& operator=(InMemoryStatsHistoryIterator&&) = + delete; + ~InMemoryStatsHistoryIterator() override; bool Valid() const override; Status status() const override; + // Move to the next stats snapshot currently available + // This function may invalidate the iterator + // REQUIRES: Valid() void Next() override; + + // REQUIRES: Valid() uint64_t GetStatsTime() const override; + // This function is idempotent + // REQUIRES: Valid() const std::map& GetStatsMap() const override; private: @@ -36,13 +62,6 @@ class InMemoryStatsHistoryIterator final : public StatsHistoryIterator { // between [start_time, end_time) void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time); - // No copying allowed - InMemoryStatsHistoryIterator(const InMemoryStatsHistoryIterator&) = delete; - void operator=(const InMemoryStatsHistoryIterator&) = delete; - InMemoryStatsHistoryIterator(InMemoryStatsHistoryIterator&&) = delete; - InMemoryStatsHistoryIterator& operator=(InMemoryStatsHistoryIterator&&) = - delete; - uint64_t time_; uint64_t start_time_; uint64_t end_time_; diff --git a/monitoring/instrumented_mutex.cc b/monitoring/instrumented_mutex.cc index 7b61bcf4fb8..796bb26dd4b 100644 --- a/monitoring/instrumented_mutex.cc +++ b/monitoring/instrumented_mutex.cc @@ -6,7 +6,7 @@ #include "monitoring/instrumented_mutex.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { namespace { diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 3d102f91203..20e0467ab88 100644 --- a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -54,7 +54,9 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const { IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos); IOSTATS_CONTEXT_OUTPUT(logger_nanos); - return ss.str(); + std::string str = ss.str(); + str.erase(str.find_last_not_of(", ") + 1); + return str; } } // namespace rocksdb diff --git a/monitoring/iostats_context_test.cc b/monitoring/iostats_context_test.cc index 74d3e43291d..28d305d021a 100644 --- a/monitoring/iostats_context_test.cc +++ b/monitoring/iostats_context_test.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
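The IOStatsContext::ToString change above (and the matching PerfContext::ToString change just below) trims the trailing ", " left behind by the OUTPUT macros using find_last_not_of. The same trick in isolation:

#include <string>

// "a = 1, b = 2, "  ->  "a = 1, b = 2"
std::string StripTrailingSeparator(std::string str) {
  str.erase(str.find_last_not_of(", ") + 1);
  return str;
}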
#include "rocksdb/iostats_context.h" -#include "util/testharness.h" +#include "test_util/testharness.h" namespace rocksdb { diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 40b0b215c47..5e0d5ac2544 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -529,7 +529,10 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_hit_count); PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_miss_count); - return ss.str(); + + std::string str = ss.str(); + str.erase(str.find_last_not_of(", ") + 1); + return str; #endif } diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index e0ff8afc58e..7bf62060557 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -22,12 +22,16 @@ extern thread_local PerfContext perf_context; #if defined(NPERF_CONTEXT) -#define PERF_TIMER_GUARD(metric) -#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition) -#define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) +#define PERF_TIMER_GUARD(metric) +#define PERF_TIMER_GUARD_WITH_ENV(metric, env) +#define PERF_CPU_TIMER_GUARD(metric, env) +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) +#define PERF_TIMER_MEASURE(metric) #define PERF_COUNTER_ADD(metric, value) +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) #else diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc new file mode 100644 index 00000000000..74f7303648f --- /dev/null +++ b/monitoring/persistent_stats_history.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "monitoring/persistent_stats_history.h" + +#include +#include +#include +#include "db/db_impl/db_impl.h" +#include "port/likely.h" +#include "util/string_util.h" + +namespace rocksdb { +// 10 digit seconds timestamp => [Sep 9, 2001 ~ Nov 20, 2286] +const int kNowSecondsStringLength = 10; +const std::string kFormatVersionKeyString = + "__persistent_stats_format_version__"; +const std::string kCompatibleVersionKeyString = + "__persistent_stats_compatible_version__"; +// Every release maintains two versions numbers for persistents stats: Current +// format version and compatible format version. Current format version +// designates what type of encoding will be used when writing to stats CF; +// compatible format version designates the minimum format version that +// can decode the stats CF encoded using the current format version. 
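As a rough illustration of how the two persisted version numbers described above can be consulted on recovery (a hypothetical check sketched from that comment, not code from this patch): stats written by the same or an older format are always readable, and stats written by a newer format are readable only if that writer declared this reader's format as still compatible.

#include <cstdint>

bool CanDecodePersistentStats(uint64_t persisted_format_version,
                              uint64_t persisted_compatible_version,
                              uint64_t reader_current_format_version) {
  if (persisted_format_version <= reader_current_format_version) {
    return true;  // same or older encoding
  }
  // Newer encoding: readable only if its compatible version covers us.
  return persisted_compatible_version <= reader_current_format_version;
}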
+const uint64_t kStatsCFCurrentFormatVersion = 1; +const uint64_t kStatsCFCompatibleFormatVersion = 1; + +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number) { + if (type >= StatsVersionKeyType::kKeyTypeMax) { + return Status::InvalidArgument("Invalid stats version key type provided"); + } + std::string key; + if (type == StatsVersionKeyType::kFormatVersion) { + key = kFormatVersionKeyString; + } else if (type == StatsVersionKeyType::kCompatibleVersion) { + key = kCompatibleVersionKeyString; + } + ReadOptions options; + options.verify_checksums = true; + std::string result; + Status s = db->Get(options, db->PersistentStatsColumnFamily(), key, &result); + if (!s.ok() || result.empty()) { + return Status::NotFound("Persistent stats version key " + key + + " not found."); + } + + // read version_number but do nothing in current version + *version_number = ParseUint64(result); + return Status::OK(); +} + +int EncodePersistentStatsKey(uint64_t now_seconds, const std::string& key, + int size, char* buf) { + char timestamp[kNowSecondsStringLength + 1]; + // make time stamp string equal in length to allow sorting by time + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(now_seconds)); + timestamp[kNowSecondsStringLength] = '\0'; + return snprintf(buf, size, "%s#%s", timestamp, key.c_str()); +} + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo) { + cfo->write_buffer_size = 2 << 20; + cfo->target_file_size_base = 2 * 1048576; + cfo->max_bytes_for_level_base = 10 * 1048576; + cfo->soft_pending_compaction_bytes_limit = 256 * 1048576; + cfo->hard_pending_compaction_bytes_limit = 1073741824ul; + cfo->compression = kNoCompression; +} + +PersistentStatsHistoryIterator::~PersistentStatsHistoryIterator() {} + +bool PersistentStatsHistoryIterator::Valid() const { return valid_; } + +Status PersistentStatsHistoryIterator::status() const { return status_; } + +void PersistentStatsHistoryIterator::Next() { + // increment start_time by 1 to avoid infinite loop + AdvanceIteratorByTime(GetStatsTime() + 1, end_time_); +} + +uint64_t PersistentStatsHistoryIterator::GetStatsTime() const { return time_; } + +const std::map& +PersistentStatsHistoryIterator::GetStatsMap() const { + return stats_map_; +} + +std::pair parseKey(const Slice& key, + uint64_t start_time) { + std::pair result; + std::string key_str = key.ToString(); + std::string::size_type pos = key_str.find("#"); + // TODO(Zhongyi): add counters to track parse failures? 
+ if (pos == std::string::npos) { + result.first = port::kMaxUint64; + result.second.clear(); + } else { + uint64_t parsed_time = ParseUint64(key_str.substr(0, pos)); + // skip entries with timestamp smaller than start_time + if (parsed_time < start_time) { + result.first = port::kMaxUint64; + result.second = ""; + } else { + result.first = parsed_time; + std::string key_resize = key_str.substr(pos + 1); + result.second = key_resize; + } + } + return result; +} + +// advance the iterator to the next time between [start_time, end_time) +// if success, update time_ and stats_map_ with new_time and stats_map +void PersistentStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time, + uint64_t end_time) { + // try to find next entry in stats_history_ map + if (db_impl_ != nullptr) { + ReadOptions ro; + Iterator* iter = + db_impl_->NewIterator(ro, db_impl_->PersistentStatsColumnFamily()); + + char timestamp[kNowSecondsStringLength + 1]; + snprintf(timestamp, sizeof(timestamp), "%010d", + static_cast(std::max(time_, start_time))); + timestamp[kNowSecondsStringLength] = '\0'; + + iter->Seek(timestamp); + // no more entries with timestamp >= start_time is found or version key + // is found to be incompatible + if (!iter->Valid()) { + valid_ = false; + delete iter; + return; + } + time_ = parseKey(iter->key(), start_time).first; + valid_ = true; + // check parsed time and invalid if it exceeds end_time + if (time_ > end_time) { + valid_ = false; + delete iter; + return; + } + // find all entries with timestamp equal to time_ + std::map new_stats_map; + std::pair kv; + for (; iter->Valid(); iter->Next()) { + kv = parseKey(iter->key(), start_time); + if (kv.first != time_) { + break; + } + if (kv.second.compare(kFormatVersionKeyString) == 0) { + continue; + } + new_stats_map[kv.second] = ParseUint64(iter->value().ToString()); + } + stats_map_.swap(new_stats_map); + delete iter; + } else { + valid_ = false; + } +} + +} // namespace rocksdb diff --git a/monitoring/persistent_stats_history.h b/monitoring/persistent_stats_history.h new file mode 100644 index 00000000000..9a6885987fd --- /dev/null +++ b/monitoring/persistent_stats_history.h @@ -0,0 +1,83 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
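EncodePersistentStatsKey and parseKey above define the stats CF key layout: a zero-padded 10-digit seconds timestamp, a '#' separator, then the stat name, so keys sort by time. A small round-trip sketch under those assumptions (helper names are hypothetical, and the timestamp is capped at 10 digits just as in the code above):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

// Encode, e.g. "0000000005#rocksdb.block.cache.miss"
std::string MakeStatsKey(uint64_t now_seconds, const std::string& stat_name) {
  char buf[128];
  snprintf(buf, sizeof(buf), "%010d#%s", static_cast<int>(now_seconds),
           stat_name.c_str());
  return buf;
}

// Decode back into (timestamp, stat name); returns false if no '#' is found.
bool SplitStatsKey(const std::string& key, uint64_t* seconds,
                   std::string* stat_name) {
  std::string::size_type pos = key.find('#');
  if (pos == std::string::npos) {
    return false;
  }
  *seconds = strtoull(key.substr(0, pos).c_str(), nullptr, 10);
  *stat_name = key.substr(pos + 1);
  return true;
}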
+ +#pragma once + +#include "db/db_impl/db_impl.h" +#include "rocksdb/stats_history.h" + +namespace rocksdb { + +extern const std::string kFormatVersionKeyString; +extern const std::string kCompatibleVersionKeyString; +extern const uint64_t kStatsCFCurrentFormatVersion; +extern const uint64_t kStatsCFCompatibleFormatVersion; + +enum StatsVersionKeyType : uint32_t { + kFormatVersion = 1, + kCompatibleVersion = 2, + kKeyTypeMax = 3 +}; + +// Read the version number from persitent stats cf depending on type provided +// stores the version number in `*version_number` +// returns Status::OK() on success, or other status code on failure +Status DecodePersistentStatsVersionNumber(DBImpl* db, StatsVersionKeyType type, + uint64_t* version_number); + +// Encode timestamp and stats key into buf +// Format: timestamp(10 digit) + '#' + key +// Total length of encoded key will be capped at 100 bytes +int EncodePersistentStatsKey(uint64_t timestamp, const std::string& key, + int size, char* buf); + +void OptimizeForPersistentStats(ColumnFamilyOptions* cfo); + +class PersistentStatsHistoryIterator final : public StatsHistoryIterator { + public: + PersistentStatsHistoryIterator(uint64_t start_time, uint64_t end_time, + DBImpl* db_impl) + : time_(0), + start_time_(start_time), + end_time_(end_time), + valid_(true), + db_impl_(db_impl) { + AdvanceIteratorByTime(start_time_, end_time_); + } + ~PersistentStatsHistoryIterator() override; + bool Valid() const override; + Status status() const override; + + void Next() override; + uint64_t GetStatsTime() const override; + + const std::map& GetStatsMap() const override; + + private: + // advance the iterator to the next stats history record with timestamp + // between [start_time, end_time) + void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time); + + // No copying allowed + PersistentStatsHistoryIterator(const PersistentStatsHistoryIterator&) = + delete; + void operator=(const PersistentStatsHistoryIterator&) = delete; + PersistentStatsHistoryIterator(PersistentStatsHistoryIterator&&) = delete; + PersistentStatsHistoryIterator& operator=(PersistentStatsHistoryIterator&&) = + delete; + + uint64_t time_; + uint64_t start_time_; + uint64_t end_time_; + std::map stats_map_; + Status status_; + bool valid_; + DBImpl* db_impl_; +}; + +} // namespace rocksdb diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index fe2f2e25af3..6942fc579f8 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -5,15 +5,11 @@ // #include "monitoring/statistics.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#include "rocksdb/statistics.h" -#include "port/likely.h" #include +#include #include +#include "port/likely.h" +#include "rocksdb/statistics.h" namespace rocksdb { @@ -166,6 +162,7 @@ const std::vector> TickersNameMap = { "rocksdb.txn.overhead.mutex.old.commit.map"}, {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, + {TXN_GET_TRY_AGAIN, "rocksdb.txn.get.tryagain"}, {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, diff --git a/monitoring/statistics_test.cc b/monitoring/statistics_test.cc index a77022bfb3d..162afb264b2 100644 --- a/monitoring/statistics_test.cc +++ b/monitoring/statistics_test.cc @@ -5,8 +5,8 @@ // #include "port/stack_trace.h" -#include "util/testharness.h" -#include 
"util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "rocksdb/statistics.h" diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc new file mode 100644 index 00000000000..f1ec3391132 --- /dev/null +++ b/monitoring/stats_history_test.cc @@ -0,0 +1,653 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include +#include + +#include "db/column_family.h" +#include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" +#include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/convenience.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/stats_history.h" +#include "test_util/sync_point.h" +#include "test_util/testutil.h" +#include "util/random.h" + +namespace rocksdb { + +class StatsHistoryTest : public DBTestBase { + public: + StatsHistoryTest() : DBTestBase("/stats_history_test") {} +}; +#ifndef ROCKSDB_LITE + +TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_dump_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + int counter = 0; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DumpStats:1", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec); + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cacel job through SetOptions + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); + int old_val = counter; + for (int i = 6; i < 20; ++i) { + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); + } + ASSERT_EQ(counter, old_val); + Close(); +} + +// Test persistent stats background thread scheduling and cancelling +TEST_F(StatsHistoryTest, StatsPersistScheduling) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = 
*reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test cacel job through SetOptions + ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); + Close(); +} + +// Test enabling persistent stats for the first time +TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 0; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); + ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + Close(); +} + +// TODO(Zhongyi): Move persistent stats related tests to a separate file +TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + int mock_time = 1; + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 6 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + // disabled stats snapshots + 
ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + size_t stats_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5); + stats_count += stats_map.size(); + } + ASSERT_GT(stats_count, 0); + // Wait a bit and verify no more stats are found + for (mock_time = 6; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_new = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + stats_count_new += stats_iter->GetStatsMap().size(); + } + ASSERT_EQ(stats_count_new, stats_count); + Close(); +} + +TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { + Options options; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.stats_persist_period_sec = 1; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // some random operation to populate statistics + ASSERT_OK(Delete("foo")); + ASSERT_OK(Put("sol", "sol")); + ASSERT_OK(Put("epic", "epic")); + ASSERT_OK(Put("ltd", "ltd")); + ASSERT_EQ("sol", Get("sol")); + ASSERT_EQ("epic", Get("epic")); + ASSERT_EQ("ltd", Get("ltd")); + Iterator* iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + ASSERT_OK(Delete("sol")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + int mock_time = 1; + // Wait for stats persist to finish + for (; mock_time < 5; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + + // second round of ops + ASSERT_OK(Put("saigon", "saigon")); + ASSERT_OK(Put("noodle talk", "noodle talk")); + ASSERT_OK(Put("ping bistro", "ping bistro")); + iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + for (; mock_time < 10; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 10 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count += stats_map.size(); + } + size_t 
stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + ASSERT_GE(slice_count, 9); + ASSERT_GE(stats_history_size, 12000); + // capping memory cost at 12000 bytes since one slice is around 10000~12000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); + ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); + // Wait for stats persist to finish + for (; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0 /*start_time*/, 20 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count_reopen += stats_map.size(); + } + size_t stats_history_size_reopen = + dbfull()->TEST_EstimateInMemoryStatsHistorySize(); + // only one slice can fit under the new stats_history_buffer_size + ASSERT_LT(slice_count, 2); + ASSERT_TRUE(stats_history_size_reopen < 12000 && + stats_history_size_reopen > 0); + ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); + Close(); + // TODO: may also want to verify stats timestamp to make sure we are purging + // the correct stats snapshot +} + +int countkeys(Iterator* iter) { + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + count++; + } + return count; +} + +TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count1 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count2 = countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count3 = countkeys(iter); + delete iter; + ASSERT_GE(key_count2, key_count1); + ASSERT_GE(key_count3, key_count2); + ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + int non_zero_count = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count++; + } + } + stats_count += stats_map.size(); + } + ASSERT_EQ(slice_count, 3); + // 2 extra keys for format version + ASSERT_EQ(stats_count, key_count3 - 2); + // verify reopen will not 
cause data loss + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 16 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + int slice_count_reopen = 0; + int non_zero_count_recover = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count_reopen++; + auto stats_map = stats_iter->GetStatsMap(); + for (auto& stat : stats_map) { + if (stat.second != 0) { + non_zero_count_recover++; + } + } + stats_count_reopen += stats_map.size(); + } + ASSERT_EQ(non_zero_count, non_zero_count_recover); + ASSERT_EQ(slice_count, slice_count_reopen); + ASSERT_EQ(stats_count, stats_count_reopen); + Close(); +} + +// Test persisted stats matches the value found in options.statistics and +// the stats value retains after DB reopen +TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + std::map stats_map_before; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_before)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(Get("foo"), "bar"); + + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + countkeys(iter); + delete iter; + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(20); }); + + std::map stats_map_after; + ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + std::string sample = "rocksdb.num.iterator.deleted"; + uint64_t recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), ++i) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, stats_map_after[sample]); + + // test stats value retains after recovery + ReopenWithColumnFamilies({"default", "pikachu"}, options); + db_->GetStatsHistory(0 /*start_time*/, 21 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + uint64_t new_recovered_value = 0; + for (int i = 1; stats_iter->Valid(); stats_iter->Next(), i++) { + auto stats_map = stats_iter->GetStatsMap(); + ASSERT_EQ(stats_iter->GetStatsTime(), 5 * i); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + new_recovered_value += stat.second; + } + } + } + ASSERT_EQ(recovered_value, new_recovered_value); + + // TODO(Zhongyi): also add test to read raw values from 
disk and verify + // correctness + Close(); +} + +// TODO(Zhongyi): add test for different format versions + +TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + ASSERT_OK(TryReopen(options)); + CreateColumnFamilies({"one", "two", "three"}, options); + ASSERT_OK(Put(1, "foo", "bar")); + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + ASSERT_EQ(Get(2, "foo"), "bar"); + CreateColumnFamilies({"four"}, options); + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + ASSERT_EQ(Get(2, "foo"), "bar"); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + auto iter = + db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); + int key_count = countkeys(iter); + delete iter; + ASSERT_GE(key_count, 0); + uint64_t num_write_wal = 0; + std::string sample = "rocksdb.write.wal"; + std::unique_ptr stats_iter; + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + stats_iter.reset(); + ASSERT_EQ(num_write_wal, 2); + + options.persist_stats_to_disk = false; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + int cf_count = 0; + for (auto cfd : *dbfull()->versions_->GetColumnFamilySet()) { + (void)cfd; + cf_count++; + } + // persistent stats cf will be implicitly opened even if + // persist_stats_to_disk is false + ASSERT_EQ(cf_count, 6); + ASSERT_EQ(Get(2, "foo"), "bar"); + + // attempt to create column family using same name, should fail + ColumnFamilyOptions cf_opts(options); + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + + options.persist_stats_to_disk = true; + ReopenWithColumnFamilies({"default", "one", "two", "three", "four"}, options); + ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, + &handle)); + // verify stats is not affected by prior failed CF creation + db_->GetStatsHistory(0 /*start_time*/, 5 /*end_time*/, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + num_write_wal = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + for (const auto& stat : stats_map) { + if (sample.compare(stat.first) == 0) { + num_write_wal += stat.second; + } + } + } + ASSERT_EQ(num_write_wal, 2); + + Close(); + Destroy(options); +} + +TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { + ASSERT_OK(Put("bar", "v2")); + Close(); + + auto options = CurrentOptions(); + options.stats_persist_period_sec = 5; + options.persist_stats_to_disk = true; + assert(options.env == env_); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ("v2", Get("bar")); + Close(); + + // Reopen and flush memtable. + ASSERT_OK(TryReopen(options)); + Flush(); + Close(); + // Now check keys in read only mode. 
+ ASSERT_OK(ReadOnlyReopen(options)); +} + +TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { + Options options; + options.create_if_missing = true; + options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + options.persist_stats_to_disk = true; + std::unique_ptr mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + CreateColumnFamilies({"pikachu"}, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ColumnFamilyData* cfd_default = + static_cast(dbfull()->DefaultColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_stats = static_cast( + dbfull()->PersistentStatsColumnFamily()) + ->cfd(); + ColumnFamilyData* cfd_test = + static_cast(handles_[1])->cfd(); + + ASSERT_OK(Put("foo", "v0")); + ASSERT_OK(Put("bar", "v0")); + ASSERT_EQ("v0", Get("bar")); + ASSERT_EQ("v0", Get("foo")); + ASSERT_OK(Put(1, "Eevee", "v0")); + ASSERT_EQ("v0", Get(1, "Eevee")); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + // writing to all three cf, flush default cf + // LogNumbers: default: 14, stats: 4, pikachu: 4 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo1", "v1")); + ASSERT_OK(Put("bar1", "v1")); + ASSERT_EQ("v1", Get("bar1")); + ASSERT_EQ("v1", Get("foo1")); + ASSERT_OK(Put(1, "Vaporeon", "v1")); + ASSERT_EQ("v1", Get(1, "Vaporeon")); + // writing to default and test cf, flush test cf + // LogNumbers: default: 14, stats: 16, pikachu: 16 + ASSERT_OK(Flush(1)); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_GT(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo2", "v2")); + ASSERT_OK(Put("bar2", "v2")); + ASSERT_EQ("v2", Get("bar2")); + ASSERT_EQ("v2", Get("foo2")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(10); }); + // writing to default and stats cf, flushing default cf + // LogNumbers: default: 19, stats: 19, pikachu: 19 + ASSERT_OK(Flush()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + + ASSERT_OK(Put("foo3", "v3")); + ASSERT_OK(Put("bar3", "v3")); + ASSERT_EQ("v3", Get("bar3")); + ASSERT_EQ("v3", Get("foo3")); + ASSERT_OK(Put(1, "Jolteon", "v3")); + ASSERT_EQ("v3", Get(1, "Jolteon")); + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(15); }); + // writing to all three cf, flushing test cf + // LogNumbers: default: 19, stats: 19, pikachu: 22 + ASSERT_OK(Flush(1)); + ASSERT_LT(cfd_stats->GetLogNumber(), cfd_test->GetLogNumber()); + ASSERT_EQ(cfd_stats->GetLogNumber(), cfd_default->GetLogNumber()); + Close(); +} + +#endif // !ROCKSDB_LITE +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/options/cf_options.cc b/options/cf_options.cc index f7af3f834c9..ef06eaf15bb 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -5,19 +5,15 @@ #include "options/cf_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include #include "options/db_options.h" #include "port/port.h" +#include "rocksdb/concurrent_task_limiter.h" #include 
"rocksdb/env.h" #include "rocksdb/options.h" -#include "rocksdb/concurrent_task_limiter.h" namespace rocksdb { @@ -37,6 +33,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, cf_options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( cf_options.max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain( + cf_options.max_write_buffer_size_to_maintain), inplace_update_support(cf_options.inplace_update_support), inplace_callback(cf_options.inplace_callback), info_log(db_options.info_log.get()), @@ -169,8 +167,6 @@ void MutableCFOptions::Dump(Logger* log) const { target_file_size_multiplier); ROCKS_LOG_INFO(log, " max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_INFO(log, " snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", max_bytes_for_level_multiplier); ROCKS_LOG_INFO(log, " ttl: %" PRIu64, diff --git a/options/cf_options.h b/options/cf_options.h index 47fca58fa7d..3a6be638167 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -20,7 +20,6 @@ namespace rocksdb { // of DB. Raw pointers defined in this struct do not have ownership to the data // they point to. Options contains std::shared_ptr to these data. struct ImmutableCFOptions { - ImmutableCFOptions(); explicit ImmutableCFOptions(const Options& options); ImmutableCFOptions(const ImmutableDBOptions& db_options, @@ -43,6 +42,8 @@ struct ImmutableCFOptions { int max_write_buffer_number_to_maintain; + int64_t max_write_buffer_size_to_maintain; + bool inplace_update_support; UpdateStatus (*inplace_callback)(char* existing_value, @@ -149,7 +150,6 @@ struct MutableCFOptions { target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), max_bytes_for_level_base(options.max_bytes_for_level_base), - snap_refresh_nanos(options.snap_refresh_nanos), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), @@ -186,7 +186,6 @@ struct MutableCFOptions { target_file_size_base(0), target_file_size_multiplier(0), max_bytes_for_level_base(0), - snap_refresh_nanos(0), max_bytes_for_level_multiplier(0), ttl(0), periodic_compaction_seconds(0), @@ -238,7 +237,6 @@ struct MutableCFOptions { uint64_t target_file_size_base; int target_file_size_multiplier; uint64_t max_bytes_for_level_base; - uint64_t snap_refresh_nanos; double max_bytes_for_level_multiplier; uint64_t ttl; uint64_t periodic_compaction_seconds; diff --git a/options/db_options.cc b/options/db_options.cc index 83f1a18b042..ca2800d0784 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -5,18 +5,14 @@ #include "options/db_options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include +#include "logging/logging.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" #include "rocksdb/wal_filter.h" -#include "util/logging.h" namespace rocksdb { @@ -48,6 +44,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) table_cache_numshardbits(options.table_cache_numshardbits), wal_ttl_seconds(options.WAL_ttl_seconds), wal_size_limit_mb(options.WAL_size_limit_MB), + max_write_batch_group_size_bytes( + options.max_write_batch_group_size_bytes), manifest_preallocation_size(options.manifest_preallocation_size), allow_mmap_reads(options.allow_mmap_reads), 
allow_mmap_writes(options.allow_mmap_writes), @@ -67,6 +65,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) listeners(options.listeners), enable_thread_tracking(options.enable_thread_tracking), enable_pipelined_write(options.enable_pipelined_write), + unordered_write(options.unordered_write), allow_concurrent_memtable_write(options.allow_concurrent_memtable_write), enable_write_thread_adaptive_yield( options.enable_write_thread_adaptive_yield), @@ -87,7 +86,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) two_write_queues(options.two_write_queues), manual_wal_flush(options.manual_wal_flush), atomic_flush(options.atomic_flush), - avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io) { + avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io), + persist_stats_to_disk(options.persist_stats_to_disk), + write_dbid_to_manifest(options.write_dbid_to_manifest), + log_readahead_size(options.log_readahead_size) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -153,6 +155,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.WAL_size_limit_MB: %" PRIu64, wal_size_limit_mb); + ROCKS_LOG_HEADER(log, + " " + "Options.max_write_batch_group_size_bytes: %" PRIu64, + max_write_batch_group_size_bytes); ROCKS_LOG_HEADER( log, " Options.manifest_preallocation_size: %" ROCKSDB_PRIszt, manifest_preallocation_size); @@ -185,6 +191,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { enable_thread_tracking); ROCKS_LOG_HEADER(log, " Options.enable_pipelined_write: %d", enable_pipelined_write); + ROCKS_LOG_HEADER(log, " Options.unordered_write: %d", + unordered_write); ROCKS_LOG_HEADER(log, " Options.allow_concurrent_memtable_write: %d", allow_concurrent_memtable_write); ROCKS_LOG_HEADER(log, " Options.enable_write_thread_adaptive_yield: %d", @@ -223,6 +231,13 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.avoid_unnecessary_blocking_io: %d", avoid_unnecessary_blocking_io); + ROCKS_LOG_HEADER(log, " Options.persist_stats_to_disk: %u", + persist_stats_to_disk); + ROCKS_LOG_HEADER(log, " Options.write_dbid_to_manifest: %d", + write_dbid_to_manifest); + ROCKS_LOG_HEADER( + log, " Options.log_readahead_size: %" ROCKSDB_PRIszt, + log_readahead_size); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 8d02003623e..7c71b12a0cc 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -43,6 +43,7 @@ struct ImmutableDBOptions { int table_cache_numshardbits; uint64_t wal_ttl_seconds; uint64_t wal_size_limit_mb; + uint64_t max_write_batch_group_size_bytes; size_t manifest_preallocation_size; bool allow_mmap_reads; bool allow_mmap_writes; @@ -60,6 +61,7 @@ struct ImmutableDBOptions { std::vector> listeners; bool enable_thread_tracking; bool enable_pipelined_write; + bool unordered_write; bool allow_concurrent_memtable_write; bool enable_write_thread_adaptive_yield; uint64_t write_thread_max_yield_usec; @@ -80,6 +82,9 @@ struct ImmutableDBOptions { bool manual_wal_flush; bool atomic_flush; bool avoid_unnecessary_blocking_io; + bool persist_stats_to_disk; + bool write_dbid_to_manifest; + size_t log_readahead_size; }; struct MutableDBOptions { diff --git a/options/options.cc b/options/options.cc index 900510d01b6..db1cd7130bf 100644 --- a/options/options.cc +++ b/options/options.cc @@ -9,11 +9,7 @@ #include "rocksdb/options.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include +#include 
#include #include "monitoring/statistics.h" @@ -31,7 +27,7 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "rocksdb/wal_filter.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "util/compression.h" namespace rocksdb { @@ -46,6 +42,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( options.max_write_buffer_number_to_maintain), + max_write_buffer_size_to_maintain( + options.max_write_buffer_size_to_maintain), inplace_update_support(options.inplace_update_support), inplace_update_num_locks(options.inplace_update_num_locks), inplace_callback(options.inplace_callback), @@ -162,6 +160,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { min_write_buffer_number_to_merge); ROCKS_LOG_HEADER(log, " Options.max_write_buffer_number_to_maintain: %d", max_write_buffer_number_to_maintain); + ROCKS_LOG_HEADER(log, + " Options.max_write_buffer_size_to_maintain: %" PRIu64, + max_write_buffer_size_to_maintain); ROCKS_LOG_HEADER( log, " Options.bottommost_compression_opts.window_bits: %d", bottommost_compression_opts.window_bits); @@ -215,9 +216,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.max_bytes_for_level_base: %" PRIu64, max_bytes_for_level_base); - ROCKS_LOG_HEADER( - log, " Options.snap_refresh_nanos: %" PRIu64, - snap_refresh_nanos); ROCKS_LOG_HEADER(log, "Options.level_compaction_dynamic_level_bytes: %d", level_compaction_dynamic_level_bytes); ROCKS_LOG_HEADER(log, " Options.max_bytes_for_level_multiplier: %f", @@ -493,7 +491,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( write_buffer_size = 2 << 20; target_file_size_base = 2 * 1048576; max_bytes_for_level_base = 10 * 1048576; - snap_refresh_nanos = 0; soft_pending_compaction_bytes_limit = 256 * 1048576; hard_pending_compaction_bytes_limit = 1073741824ul; @@ -506,7 +503,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_factory.reset(new BlockBasedTableFactory(table_options)); - return this; } @@ -552,7 +548,10 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction( if (i < 2) { compression_per_level[i] = kNoCompression; } else { - compression_per_level[i] = kSnappyCompression; + compression_per_level[i] = + LZ4_Supported() + ? kLZ4Compression + : (Snappy_Supported() ? 
kSnappyCompression : kNoCompression); } } return this; @@ -597,7 +596,8 @@ ReadOptions::ReadOptions() pin_data(false), background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), - iter_start_seqnum(0) {} + iter_start_seqnum(0), + timestamp(nullptr) {} ReadOptions::ReadOptions(bool cksum, bool cache) : snapshot(nullptr), @@ -615,6 +615,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache) pin_data(false), background_purge_on_iterator_cleanup(false), ignore_range_deletions(false), - iter_start_seqnum(0) {} + iter_start_seqnum(0), + timestamp(nullptr) {} } // namespace rocksdb diff --git a/options/options_helper.cc b/options/options_helper.cc index a973bbfde51..d30264e8234 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -9,6 +9,7 @@ #include #include #include + #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" @@ -20,8 +21,8 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/utilities/object_registry.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "util/cast_util.h" #include "util/string_util.h" @@ -83,6 +84,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.stats_dump_period_sec = mutable_db_options.stats_dump_period_sec; options.stats_persist_period_sec = mutable_db_options.stats_persist_period_sec; + options.persist_stats_to_disk = immutable_db_options.persist_stats_to_disk; options.stats_history_buffer_size = mutable_db_options.stats_history_buffer_size; options.advise_random_on_open = immutable_db_options.advise_random_on_open; @@ -103,10 +105,13 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.enable_thread_tracking = immutable_db_options.enable_thread_tracking; options.delayed_write_rate = mutable_db_options.delayed_write_rate; options.enable_pipelined_write = immutable_db_options.enable_pipelined_write; + options.unordered_write = immutable_db_options.unordered_write; options.allow_concurrent_memtable_write = immutable_db_options.allow_concurrent_memtable_write; options.enable_write_thread_adaptive_yield = immutable_db_options.enable_write_thread_adaptive_yield; + options.max_write_batch_group_size_bytes = + immutable_db_options.max_write_batch_group_size_bytes; options.write_thread_max_yield_usec = immutable_db_options.write_thread_max_yield_usec; options.write_thread_slow_yield_usec = @@ -135,7 +140,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.atomic_flush = immutable_db_options.atomic_flush; options.avoid_unnecessary_blocking_io = immutable_db_options.avoid_unnecessary_blocking_io; - + options.log_readahead_size = immutable_db_options.log_readahead_size; return options; } @@ -177,7 +182,6 @@ ColumnFamilyOptions BuildColumnFamilyOptions( mutable_cf_options.target_file_size_multiplier; cf_opts.max_bytes_for_level_base = mutable_cf_options.max_bytes_for_level_base; - cf_opts.snap_refresh_nanos = mutable_cf_options.snap_refresh_nanos; cf_opts.max_bytes_for_level_multiplier = mutable_cf_options.max_bytes_for_level_multiplier; cf_opts.ttl = mutable_cf_options.ttl; @@ -254,7 +258,7 @@ const std::string kNameMergeOperator = "merge_operator"; template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const 
std::string& delimiter); namespace { @@ -349,7 +353,7 @@ bool FIFOCompactionOptionsSpecialCase(const std::string& opt_str, template bool SerializeStruct( const T& options, std::string* value, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { std::string opt_str; Status s = GetStringFromStruct(&opt_str, options, type_info_map, ";"); if (!s.ok()) { @@ -362,7 +366,7 @@ bool SerializeStruct( template bool ParseSingleStructOption( const std::string& opt_val_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { size_t end = opt_val_str.find('='); std::string key = opt_val_str.substr(0, end); std::string value = opt_val_str.substr(end + 1); @@ -371,6 +375,11 @@ bool ParseSingleStructOption( return false; } const auto& opt_info = iter->second; + if (opt_info.verification == OptionVerificationType::kDeprecated) { + // Should also skip deprecated sub-options such as + // fifo_compaction_options_type_info.ttl + return true; + } return ParseOptionHelper( reinterpret_cast(options) + opt_info.mutable_offset, opt_info.type, value); @@ -379,7 +388,7 @@ bool ParseSingleStructOption( template bool ParseStructOptions( const std::string& opt_str, T* options, - std::unordered_map type_info_map) { + const std::unordered_map& type_info_map) { assert(!opt_str.empty()); size_t start = 0; @@ -1037,21 +1046,21 @@ Status ParseColumnFamilyOption(const std::string& name, } else { if (name == kNameComparator) { // Try to get comparator from object registry first. - std::unique_ptr comp_guard; - const Comparator* comp = - NewCustomObject(value, &comp_guard); // Only support static comparator for now. - if (comp != nullptr && !comp_guard) { - new_options->comparator = comp; + Status status = ObjectRegistry::NewInstance()->NewStaticObject( + value, &new_options->comparator); + if (status.ok()) { + return status; } } else if (name == kNameMergeOperator) { // Try to get merge operator from object registry first. - std::unique_ptr> mo_guard; - std::shared_ptr* mo = - NewCustomObject>(value, &mo_guard); + std::shared_ptr mo; + Status status = + ObjectRegistry::NewInstance()->NewSharedObject( + value, &new_options->merge_operator); // Only support static comparator for now. - if (mo != nullptr) { - new_options->merge_operator = *mo; + if (status.ok()) { + return status; } } @@ -1091,7 +1100,7 @@ Status ParseColumnFamilyOption(const std::string& name, template bool SerializeSingleStructOption( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& name, const std::string& delimiter) { auto iter = type_info.find(name); if (iter == type_info.end()) { @@ -1111,7 +1120,7 @@ bool SerializeSingleStructOption( template Status GetStringFromStruct( std::string* opt_string, const T& options, - const std::unordered_map type_info, + const std::unordered_map& type_info, const std::string& delimiter) { assert(opt_string); opt_string->clear(); @@ -1183,10 +1192,10 @@ Status ParseDBOption(const std::string& name, NewGenericRateLimiter(static_cast(ParseUint64(value)))); } else if (name == kNameEnv) { // Currently `Env` can be deserialized from object registry only. - std::unique_ptr env_guard; - Env* env = NewCustomObject(value, &env_guard); + Env* env = new_options->env; + Status status = Env::LoadEnv(value, &env); // Only support static env for now. 
- if (env != nullptr && !env_guard) { + if (status.ok()) { new_options->env = env; } } else { @@ -1573,6 +1582,10 @@ std::unordered_map {offsetof(struct DBOptions, stats_persist_period_sec), OptionType::kUInt, OptionVerificationType::kNormal, true, offsetof(struct MutableDBOptions, stats_persist_period_sec)}}, + {"persist_stats_to_disk", + {offsetof(struct DBOptions, persist_stats_to_disk), + OptionType::kBoolean, OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, persist_stats_to_disk)}}, {"stats_history_buffer_size", {offsetof(struct DBOptions, stats_history_buffer_size), OptionType::kSizeT, OptionVerificationType::kNormal, true, @@ -1583,6 +1596,9 @@ std::unordered_map {"enable_pipelined_write", {offsetof(struct DBOptions, enable_pipelined_write), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"unordered_write", + {offsetof(struct DBOptions, unordered_write), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, {"allow_concurrent_memtable_write", {offsetof(struct DBOptions, allow_concurrent_memtable_write), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, @@ -1596,6 +1612,9 @@ std::unordered_map {"write_thread_slow_yield_usec", {offsetof(struct DBOptions, write_thread_slow_yield_usec), OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"max_write_batch_group_size_bytes", + {offsetof(struct DBOptions, max_write_batch_group_size_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, {"write_thread_max_yield_usec", {offsetof(struct DBOptions, write_thread_max_yield_usec), OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, @@ -1649,6 +1668,12 @@ std::unordered_map {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), OptionType::kBoolean, OptionVerificationType::kNormal, false, offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}}, + {"write_dbid_to_manifest", + {offsetof(struct DBOptions, write_dbid_to_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"log_readahead_size", + {offsetof(struct DBOptions, log_readahead_size), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, }; std::unordered_map @@ -1656,7 +1681,9 @@ std::unordered_map {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}, + {"kBinarySearchWithFirstKey", + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}}; std::unordered_map OptionsHelper::block_base_table_data_block_index_type_string_map = { @@ -1853,6 +1880,9 @@ std::unordered_map {"max_write_buffer_number_to_maintain", {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"max_write_buffer_size_to_maintain", + {offset_of(&ColumnFamilyOptions::max_write_buffer_size_to_maintain), + OptionType::kInt64T, OptionVerificationType::kNormal, false, 0}}, {"min_write_buffer_number_to_merge", {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, @@ -1912,9 +1942,8 @@ std::unordered_map OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, {"snap_refresh_nanos", - {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), 
- OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, snap_refresh_nanos)}}, + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated, true, + 0}}, {"max_bytes_for_level_multiplier", {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), OptionType::kDouble, OptionVerificationType::kNormal, true, diff --git a/options/options_parser.cc b/options/options_parser.cc index f09e53e4a49..6d38f019265 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -13,13 +13,14 @@ #include #include +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" #include "options/options_helper.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" -#include "util/file_reader_writer.h" #include "util/string_util.h" -#include "util/sync_point.h" #include "port/port.h" diff --git a/options/options_parser.h b/options/options_parser.h index 5aab3e7e9b6..b2a806f179f 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -12,7 +12,7 @@ #include "options/options_sanity_check.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" namespace rocksdb { diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 2d6cc11c02e..bc2e088a6e4 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -7,15 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include #include "options/options_helper.h" #include "rocksdb/convenience.h" -#include "util/testharness.h" +#include "test_util/testharness.h" #ifndef GFLAGS bool FLAGS_enable_print = false; @@ -233,6 +229,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "delete_obsolete_files_period_micros=4294967758;" "WAL_ttl_seconds=4295008036;" "WAL_size_limit_MB=4295036161;" + "max_write_batch_group_size_bytes=1048576;" "wal_dir=path/to/wal_dir;" "db_write_buffer_size=2587;" "max_subcompactions=64330;" @@ -269,6 +266,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "allow_mmap_writes=false;" "stats_dump_period_sec=70127;" "stats_persist_period_sec=54321;" + "persist_stats_to_disk=true;" "stats_history_buffer_size=14159;" "allow_fallocate=true;" "allow_mmap_reads=false;" @@ -279,6 +277,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "advise_random_on_open=true;" "fail_if_options_file_error=false;" "enable_pipelined_write=false;" + "unordered_write=false;" "allow_concurrent_memtable_write=true;" "wal_recovery_mode=kPointInTimeRecovery;" "enable_write_thread_adaptive_yield=true;" @@ -297,7 +296,9 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "manual_wal_flush=false;" "seq_per_batch=false;" "atomic_flush=false;" - "avoid_unnecessary_blocking_io=false", + "avoid_unnecessary_blocking_io=false;" + "log_readahead_size=0;" + "write_dbid_to_manifest=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), @@ -352,10 +353,10 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { sizeof(std::shared_ptr)}, {offset_of(&ColumnFamilyOptions::prefix_extractor), sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::snap_refresh_nanos), sizeof(uint64_t)}, 
{offset_of(&ColumnFamilyOptions::table_factory), sizeof(std::shared_ptr)}, - {offset_of(&ColumnFamilyOptions::cf_paths), - sizeof(std::vector)}, + {offset_of(&ColumnFamilyOptions::cf_paths), sizeof(std::vector)}, {offset_of(&ColumnFamilyOptions::compaction_thread_limiter), sizeof(std::shared_ptr)}, }; @@ -415,7 +416,6 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:" "kSnappyCompression;" "max_bytes_for_level_base=986;" - "snap_refresh_nanos=1000000000;" "bloom_locality=8016;" "target_file_size_base=4294976376;" "memtable_huge_page_size=2557;" @@ -439,6 +439,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "soft_rate_limit=530.615385;" "soft_pending_compaction_bytes_limit=0;" "max_write_buffer_number_to_maintain=84;" + "max_write_buffer_size_to_maintain=2147483648;" "merge_operator=aabcxehazrMergeOperator;" "memtable_prefix_bloom_size_ratio=0.4642;" "memtable_whole_key_filtering=true;" diff --git a/options/options_test.cc b/options/options_test.cc index ded336dd18d..d1c9db039d4 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -7,14 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include +#include #include #include -#include #include "cache/lru_cache.h" #include "cache/sharded_cache.h" @@ -27,11 +23,12 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/leveldb_options.h" #include "rocksdb/utilities/object_registry.h" +#include "table/block_based/filter_policy_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators/bytesxor.h" #ifndef GFLAGS @@ -53,6 +50,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"max_write_buffer_number", "2"}, {"min_write_buffer_number_to_merge", "3"}, {"max_write_buffer_number_to_maintain", "99"}, + {"max_write_buffer_size_to_maintain", "-99999"}, {"compression", "kSnappyCompression"}, {"compression_per_level", "kNoCompression:" @@ -74,7 +72,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"target_file_size_base", "12"}, {"target_file_size_multiplier", "13"}, {"max_bytes_for_level_base", "14"}, - {"snap_refresh_nanos", "1000000000"}, {"level_compaction_dynamic_level_bytes", "true"}, {"max_bytes_for_level_multiplier", "15.0"}, {"max_bytes_for_level_multiplier_additional", "16:17:18"}, @@ -133,6 +130,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"skip_log_error_on_recovery", "false"}, {"stats_dump_period_sec", "46"}, {"stats_persist_period_sec", "57"}, + {"persist_stats_to_disk", "false"}, {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, @@ -153,6 +151,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99); + ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999); ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); ASSERT_EQ(new_cf_opt.compression_per_level.size(), 9U); ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression); @@ -167,15 +166,15 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { 
ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); ASSERT_EQ(new_cf_opt.compression_opts.level, 5); ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); - ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7); - ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8); + ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7u); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8u); ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); - ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8); - ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8u); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9u); ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.num_levels, 8); ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); @@ -184,7 +183,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); - ASSERT_EQ(new_cf_opt.snap_refresh_nanos, 1000000000U); ASSERT_EQ(new_cf_opt.level_compaction_dynamic_level_bytes, true); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15.0); ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); @@ -271,6 +269,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); + ASSERT_EQ(new_db_opt.persist_stats_to_disk, false); ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); @@ -343,11 +342,11 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // Comparator from object registry std::string kCompName = "reverse_comp"; - static Registrar test_reg_a( - kCompName, [](const std::string& /*name*/, - std::unique_ptr* /*comparator_guard*/) { - return ReverseBytewiseComparator(); - }); + ObjectLibrary::Default()->Register( + kCompName, + [](const std::string& /*name*/, + std::unique_ptr* /*guard*/, + std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); @@ -356,13 +355,12 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { // MergeOperator from object registry std::unique_ptr bxo(new BytesXOROperator()); std::string kMoName = bxo->Name(); - static Registrar> test_reg_b( - kMoName, [](const std::string& /*name*/, - std::unique_ptr>* - merge_operator_guard) { - merge_operator_guard->reset( - new std::shared_ptr(new BytesXOROperator())); - return merge_operator_guard->get(); + ObjectLibrary::Default()->Register( + kMoName, + [](const std::string& /*name*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new BytesXOROperator()); + return guard->get(); }); ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -384,10 +382,10 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { "write_buffer_size=13; =100;", &new_cf_opt)); 
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); - const int64_t kilo = 1024UL; - const int64_t mega = 1024 * kilo; - const int64_t giga = 1024 * mega; - const int64_t tera = 1024 * giga; + const uint64_t kilo = 1024UL; + const uint64_t mega = 1024 * kilo; + const uint64_t giga = 1024 * mega; + const uint64_t tera = 1024 * giga; // Units (k) ASSERT_OK(GetColumnFamilyOptionsFromString( @@ -398,7 +396,7 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { "max_write_buffer_number=16m;inplace_update_num_locks=17M", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega); - ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17 * mega); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega); // Units (g) ASSERT_OK(GetColumnFamilyOptionsFromString( base_cf_opt, @@ -518,13 +516,15 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { BlockBasedTableOptions table_opt; BlockBasedTableOptions new_opt; // make sure default values are overwritten by something else - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kHashSearch;" - "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" - "block_cache=1M;block_cache_compressed=1k;block_size=1024;" - "block_size_deviation=8;block_restart_interval=4;" - "filter_policy=bloomfilter:4:true;whole_key_filtering=1;", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "cache_index_and_filter_blocks=1;index_type=kHashSearch;" + "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "block_cache=1M;block_cache_compressed=1k;block_size=1024;" + "block_size_deviation=8;block_restart_interval=4;" + "format_version=5;whole_key_filtering=1;" + "filter_policy=bloomfilter:4.567:false;", + &new_opt)); ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); @@ -537,14 +537,20 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_size, 1024UL); ASSERT_EQ(new_opt.block_size_deviation, 8); ASSERT_EQ(new_opt.block_restart_interval, 4); + ASSERT_EQ(new_opt.format_version, 5U); + ASSERT_EQ(new_opt.whole_key_filtering, true); ASSERT_TRUE(new_opt.filter_policy != nullptr); + const BloomFilterPolicy& bfp = + dynamic_cast(*new_opt.filter_policy); + EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); // unknown option ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" "bad_option=1", &new_opt)); - ASSERT_EQ(table_opt.cache_index_and_filter_blocks, + ASSERT_EQ(static_cast(table_opt.cache_index_and_filter_blocks), new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); @@ -619,8 +625,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { new_opt.block_cache)->GetNumShardBits(), GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); // Default values @@ -629,16 +636,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { 
GetDefaultCacheShardBits( new_opt.block_cache_compressed->GetCapacity())); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); // Set couple of block cache options. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" - "block_cache_compressed={num_shard_bits=5;" - "high_pri_pool_ratio=0.5;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + table_opt, + "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={num_shard_bits=5;" + "high_pri_pool_ratio=0.0;}", + &new_opt)); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 5); @@ -650,9 +658,9 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 5); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.5); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.0); // Set couple of block cache options. ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, @@ -666,16 +674,17 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) + ->GetHighPriPoolRatio(), + 0.5); ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache_compressed)->GetNumShardBits(), 4); ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.0); + ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) + ->GetHighPriPoolRatio(), + 0.5); } #endif // !ROCKSDB_LITE @@ -690,7 +699,7 @@ TEST_F(OptionsTest, GetPlainTableOptionsFromString) { "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" "full_scan_mode=true;store_index_in_file=true", &new_opt)); - ASSERT_EQ(new_opt.user_key_len, 66); + ASSERT_EQ(new_opt.user_key_len, 66u); ASSERT_EQ(new_opt.bloom_bits_per_key, 20); ASSERT_EQ(new_opt.hash_table_ratio, 0.5); ASSERT_EQ(new_opt.index_sparseness, 8); @@ -769,9 +778,10 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} }; - static Registrar test_reg_env( + ObjectLibrary::Default()->Register( kCustomEnvName, - [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/) { + [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { static CustomEnv env(Env::Default()); return &env; }); @@ -789,15 +799,15 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.compression_opts.window_bits, 4); ASSERT_EQ(new_options.compression_opts.level, 5); ASSERT_EQ(new_options.compression_opts.strategy, 6); - 
ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0); - ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0); + ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0u); ASSERT_EQ(new_options.compression_opts.enabled, false); ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption); ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5); ASSERT_EQ(new_options.bottommost_compression_opts.level, 6); ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7); - ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0); - ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0); + ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0u); + ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0u); ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false); ASSERT_EQ(new_options.write_buffer_size, 10U); ASSERT_EQ(new_options.max_write_buffer_number, 16); @@ -812,8 +822,9 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.create_if_missing, true); ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); - std::unique_ptr env_guard; - ASSERT_EQ(NewCustomObject(kCustomEnvName, &env_guard), new_options.env); + Env* newEnv = new_options.env; + ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_EQ(newEnv, new_options.env); } TEST_F(OptionsTest, DBOptionsSerialization) { @@ -842,7 +853,7 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { Random rnd(301); test::RandomInitDBOptions(&base_db_opts, &rnd); - test::RandomInitCFOptions(&base_cf_opts, &rnd); + test::RandomInitCFOptions(&base_cf_opts, base_db_opts, &rnd); Options base_opts(base_db_opts, base_cf_opts); DBOptions new_db_opts(base_opts); @@ -854,11 +865,12 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { } TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { + Options options; ColumnFamilyOptions base_opt, new_opt; Random rnd(302); // Phase 1: randomly assign base_opt // custom type options - test::RandomInitCFOptions(&base_opt, &rnd); + test::RandomInitCFOptions(&base_opt, options, &rnd); // Phase 2: obtain a string from base_opt std::string base_options_file_content; @@ -1521,7 +1533,7 @@ TEST_F(OptionsParserTest, DumpAndParse) { for (int c = 0; c < num_cf; ++c) { ColumnFamilyOptions cf_opt; Random cf_rnd(0xFB + c); - test::RandomInitCFOptions(&cf_opt, &cf_rnd); + test::RandomInitCFOptions(&cf_opt, base_db_opt, &cf_rnd); if (c < 4) { cf_opt.prefix_extractor.reset(test::RandomSliceTransform(&rnd, c)); } @@ -1869,9 +1881,9 @@ TEST_F(OptionsParserTest, IntegerParsing) { ASSERT_EQ(ParseUint64("18446744073709551615"), 18446744073709551615U); ASSERT_EQ(ParseUint32("4294967295"), 4294967295U); ASSERT_EQ(ParseSizeT("18446744073709551615"), 18446744073709551615U); - ASSERT_EQ(ParseInt64("9223372036854775807"), 9223372036854775807U); + ASSERT_EQ(ParseInt64("9223372036854775807"), 9223372036854775807); ASSERT_EQ(ParseInt64("-9223372036854775808"), port::kMinInt64); - ASSERT_EQ(ParseInt32("2147483647"), 2147483647U); + ASSERT_EQ(ParseInt32("2147483647"), 2147483647); ASSERT_EQ(ParseInt32("-2147483648"), port::kMinInt32); ASSERT_EQ(ParseInt("-32767"), -32767); ASSERT_EQ(ParseDouble("-1.234567"), -1.234567); diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 0c216face13..f6f72f8cb8c 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -5,10 +5,24 @@ #pragma once +#if 
defined(__clang__) +// glibc's `posix_memalign()` declaration specifies `throw()` while clang's +// declaration does not. There is a hack in clang to make its re-declaration +// compatible with glibc's if they are declared consecutively. That hack breaks +// if yet another `posix_memalign()` declaration comes between glibc's and +// clang's declarations. Including "mm_malloc.h" here ensures glibc's and clang's +// declarations both come before "jemalloc.h"'s `posix_memalign()` declaration. +// +// This problem could also be avoided if "jemalloc.h"'s `posix_memalign()` +// declaration did not specify `throw()` when built with clang. +#include <mm_malloc.h> +#endif + #ifdef ROCKSDB_JEMALLOC #ifdef __FreeBSD__ #include <malloc_np.h> #else +#define JEMALLOC_MANGLE #include <jemalloc/jemalloc.h> #endif @@ -16,6 +30,14 @@ #define JEMALLOC_CXX_THROW #endif +#if defined(OS_WIN) && defined(_MSC_VER) + +// MSVC does not have weak symbol support. As long as ROCKSDB_JEMALLOC is +// defined, Jemalloc memory allocator is used. +static inline bool HasJemalloc() { return true; } + +#else + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); @@ -50,4 +72,6 @@ static inline bool HasJemalloc() { malloc_stats_print != nullptr && malloc_usable_size != nullptr; } +#endif + #endif // ROCKSDB_JEMALLOC diff --git a/util/filter_policy.cc b/port/malloc.h similarity index 63% rename from util/filter_policy.cc rename to port/malloc.h index efb9bf4763c..f973263e2ae 100644 --- a/util/filter_policy.cc +++ b/port/malloc.h @@ -3,14 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once -#include "rocksdb/filter_policy.h" - -namespace rocksdb { - -FilterPolicy::~FilterPolicy() { } - -} // namespace rocksdb +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE diff --git a/port/port_posix.cc b/port/port_posix.cc index 80081e480e0..167159d83c8 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -18,11 +18,11 @@ #include #include #include -#include #include +#include #include #include -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { @@ -192,7 +192,8 @@ int GetMaxOpenFiles() { return -1; } // protect against overflow - if (no_files_limit.rlim_cur >= std::numeric_limits::max()) { + if (static_cast(no_files_limit.rlim_cur) >= + static_cast(std::numeric_limits::max())) { return std::numeric_limits::max(); } return static_cast(no_files_limit.rlim_cur); diff --git a/port/port_posix.h b/port/port_posix.h index 63d7239fe6d..892089831ff 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -104,6 +104,10 @@ class CondVar; class Mutex { public: explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex); + // No copying + Mutex(const Mutex&) = delete; + void operator=(const Mutex&) = delete; + ~Mutex(); void Lock(); @@ -118,15 +122,15 @@ class Mutex { #ifndef NDEBUG bool locked_; #endif - - // No copying - Mutex(const Mutex&); - void operator=(const Mutex&); }; class RWMutex { public: RWMutex(); + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; + ~RWMutex(); void ReadLock(); @@ -137,10 +141,6 @@ class RWMutex { private: pthread_rwlock_t mu_; // the underlying platform mutex - - // No copying allowed - RWMutex(const RWMutex&); - void operator=(const RWMutex&); }; class CondVar { @@ -178,22 +178,31 @@ typedef pthread_once_t OnceType; extern void InitOnce(OnceType* once, void (*initializer)()); #ifndef CACHE_LINE_SIZE - #if defined(__s390__) - #define CACHE_LINE_SIZE 256U - #elif defined(__powerpc__) || defined(__aarch64__) - #define CACHE_LINE_SIZE 128U - #else - #define CACHE_LINE_SIZE 64U - #endif +// To test behavior with non-native cache line size, e.g. for +// Bloom filters, set TEST_CACHE_LINE_SIZE to the desired test size. +// This disables ALIGN_AS to keep it from failing compilation. +#ifdef TEST_CACHE_LINE_SIZE +#define CACHE_LINE_SIZE TEST_CACHE_LINE_SIZE +#define ALIGN_AS(n) /*empty*/ +#else +#if defined(__s390__) +#define CACHE_LINE_SIZE 256U +#elif defined(__powerpc__) || defined(__aarch64__) +#define CACHE_LINE_SIZE 128U +#else +#define CACHE_LINE_SIZE 64U +#endif +#define ALIGN_AS(n) alignas(n) +#endif #endif +static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0, + "Cache line size must be a power of 2 number of bytes"); extern void *cacheline_aligned_alloc(size_t size); extern void cacheline_aligned_free(void *memblock); -#define ALIGN_AS(n) alignas(n) - #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) extern void Crash(const std::string& srcfile, int srcline); diff --git a/port/util_logger.h b/port/util_logger.h index ba424705b27..d2d62a9879c 100644 --- a/port/util_logger.h +++ b/port/util_logger.h @@ -14,7 +14,7 @@ // of what the new port_.h file must provide. 
#if defined(ROCKSDB_PLATFORM_POSIX) -#include "env/posix_logger.h" +#include "logging/posix_logger.h" #elif defined(OS_WIN) #include "port/win/win_logger.h" #endif diff --git a/port/win/env_default.cc b/port/win/env_default.cc index d24c21918aa..584a524cf86 100644 --- a/port/win/env_default.cc +++ b/port/win/env_default.cc @@ -11,8 +11,8 @@ #include #include "port/win/env_win.h" +#include "test_util/sync_point.h" #include "util/compression_context_cache.h" -#include "util/sync_point.h" #include "util/thread_local.h" namespace rocksdb { diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 9abb14d67ea..c12d0ee4fc2 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -979,8 +979,8 @@ uint64_t WinEnvIO::NowMicros() { return li.QuadPart; } using namespace std::chrono; - return duration_cast( - high_resolution_clock::now().time_since_epoch()).count(); + return duration_cast(system_clock::now().time_since_epoch()) + .count(); } uint64_t WinEnvIO::NowNanos() { diff --git a/port/win/io_win.cc b/port/win/io_win.cc index 64ded8465d0..c433e5e522f 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -10,9 +10,9 @@ #include "port/win/io_win.h" #include "monitoring/iostats_context_imp.h" +#include "test_util/sync_point.h" #include "util/aligned_buffer.h" #include "util/coding.h" -#include "util/sync_point.h" namespace rocksdb { namespace port { @@ -175,60 +175,18 @@ Status ftruncate(const std::string& filename, HANDLE hFile, return status; } -size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) { - - if (max_size < kMaxVarint64Length * 3) { - return 0; - } -#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) - // MINGGW as defined by CMake file. - // yuslepukhin: I hate the guts of the above macros. - // This impl does not guarantee uniqueness everywhere - // is reasonably good - BY_HANDLE_FILE_INFORMATION FileInfo; - - BOOL result = GetFileInformationByHandle(hFile, &FileInfo); - - TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); - - if (!result) { - return 0; - } - - char* rid = id; - rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber)); - rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh)); - rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow)); - - assert(rid >= id); - return static_cast(rid - id); -#else - FILE_ID_INFO FileInfo; - BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo, - sizeof(FileInfo)); - - TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result); - - if (!result) { - return 0; - } - - static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber), - "Wrong sizeof expectations"); - // FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t - static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier), - "Wrong sizeof expectations"); - - char* rid = id; - rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber)); - uint64_t* file_id = reinterpret_cast(&FileInfo.FileId.Identifier[0]); - rid = EncodeVarint64(rid, *file_id); - ++file_id; - rid = EncodeVarint64(rid, *file_id); - - assert(rid >= id); - return static_cast(rid - id); -#endif +size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/, + size_t /*max_size*/) { + // Returning 0 is safe as it causes the table reader to generate a unique ID. + // This is suboptimal for performance as it prevents multiple table readers + // for the same file from sharing cached blocks. 
For example, if users have + // a low value for `max_open_files`, there can be many table readers opened + // for the same file. + // + // TODO: this is a temporary solution as it is safe but not optimal for + // performance. For more details see discussion in + // https://github.com/facebook/rocksdb/pull/5844. + return 0; } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1067,7 +1025,12 @@ Status WinRandomRWFile::Close() { ////////////////////////////////////////////////////////////////////////// /// WinMemoryMappedBuffer WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { - BOOL ret = FALSE; + BOOL ret +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif if (base_ != nullptr) { ret = ::UnmapViewOfFile(base_); assert(ret); diff --git a/port/win/port_win.cc b/port/win/port_win.cc index 03ba6ef4281..31e65e78cde 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -33,7 +33,7 @@ #include #endif -#include "util/logging.h" +#include "logging/logging.h" namespace rocksdb { diff --git a/port/win/port_win.h b/port/win/port_win.h index de41cdc7f01..1b302b3d211 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -180,6 +180,9 @@ class Mutex { class RWMutex { public: RWMutex() { InitializeSRWLock(&srwLock_); } + // No copying allowed + RWMutex(const RWMutex&) = delete; + void operator=(const RWMutex&) = delete; void ReadLock() { AcquireSRWLockShared(&srwLock_); } @@ -194,9 +197,6 @@ class RWMutex { private: SRWLOCK srwLock_; - // No copying allowed - RWMutex(const RWMutex&); - void operator=(const RWMutex&); }; class CondVar { diff --git a/port/win/win_jemalloc.cc b/port/win/win_jemalloc.cc index 3268a56affd..b2077938806 100644 --- a/port/win/win_jemalloc.cc +++ b/port/win/win_jemalloc.cc @@ -1,7 +1,7 @@ // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be diff --git a/port/win/win_thread.cc b/port/win/win_thread.cc index 9a976e2c6b8..34e54f30171 100644 --- a/port/win/win_thread.cc +++ b/port/win/win_thread.cc @@ -138,7 +138,12 @@ void WindowsThread::join() { "WaitForSingleObjectFailed: thread join"); } - BOOL rc; + BOOL rc +#if defined(_MSC_VER) + = FALSE; +#else + __attribute__((__unused__)); +#endif rc = CloseHandle(reinterpret_cast(data_->handle_)); assert(rc != 0); data_->handle_ = 0; diff --git a/src.mk b/src.mk index 55b4e3427c6..19f03123730 100644 --- a/src.mk +++ b/src.mk @@ -3,27 +3,29 @@ LIB_SOURCES = \ cache/clock_cache.cc \ cache/lru_cache.cc \ cache/sharded_cache.cc \ + db/arena_wrapped_db_iter.cc \ db/builder.cc \ db/c.cc \ db/column_family.cc \ db/compacted_db_impl.cc \ - db/compaction.cc \ - db/compaction_iterator.cc \ - db/compaction_job.cc \ - db/compaction_picker.cc \ - db/compaction_picker_fifo.cc \ - db/compaction_picker_universal.cc \ + db/compaction/compaction.cc \ + db/compaction/compaction_iterator.cc \ + db/compaction/compaction_job.cc \ + db/compaction/compaction_picker.cc \ + db/compaction/compaction_picker_fifo.cc \ + db/compaction/compaction_picker_level.cc \ + db/compaction/compaction_picker_universal.cc \ db/convenience.cc \ db/db_filesnapshot.cc \ - db/db_impl.cc \ - db/db_impl_compaction_flush.cc \ - db/db_impl_debug.cc \ - db/db_impl_experimental.cc \ - db/db_impl_files.cc \ - db/db_impl_open.cc \ - db/db_impl_readonly.cc \ - db/db_impl_secondary.cc \ - db/db_impl_write.cc \ + db/db_impl/db_impl.cc \ + db/db_impl/db_impl_compaction_flush.cc \ + db/db_impl/db_impl_debug.cc \ + db/db_impl/db_impl_experimental.cc \ + db/db_impl/db_impl_files.cc \ + db/db_impl/db_impl_open.cc \ + db/db_impl/db_impl_readonly.cc \ + db/db_impl/db_impl_secondary.cc \ + db/db_impl/db_impl_write.cc \ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ @@ -35,7 +37,7 @@ LIB_SOURCES = \ db/flush_job.cc \ db/flush_scheduler.cc \ db/forward_iterator.cc \ - db/in_memory_stats_history.cc \ + db/import_column_family_job.cc \ db/internal_stats.cc \ db/logs_with_prep_tracker.cc \ db/log_reader.cc \ @@ -52,6 +54,7 @@ LIB_SOURCES = \ db/table_cache.cc \ db/table_properties_collector.cc \ db/transaction_log_impl.cc \ + db/trim_history_scheduler.cc \ db/version_builder.cc \ db/version_edit.cc \ db/version_set.cc \ @@ -67,6 +70,22 @@ LIB_SOURCES = \ env/env_posix.cc \ env/io_posix.cc \ env/mock_env.cc \ + file/delete_scheduler.cc \ + file/file_prefetch_buffer.cc \ + file/file_util.cc \ + file/filename.cc \ + file/random_access_file_reader.cc \ + file/read_write_util.cc \ + file/readahead_raf.cc \ + file/sequence_file_reader.cc \ + file/sst_file_manager_impl.cc \ + file/writable_file_writer.cc \ + logging/auto_roll_logger.cc \ + logging/event_logger.cc \ + logging/log_buffer.cc \ + memory/arena.cc \ + memory/concurrent_arena.cc \ + memory/jemalloc_nodump_allocator.cc \ memtable/alloc_tracker.cc \ memtable/hash_linklist_rep.cc \ memtable/hash_skiplist_rep.cc \ @@ -75,10 +94,12 @@ LIB_SOURCES = \ memtable/write_buffer_manager.cc \ monitoring/histogram.cc \ monitoring/histogram_windowing.cc \ + monitoring/in_memory_stats_history.cc \ monitoring/instrumented_mutex.cc \ monitoring/iostats_context.cc \ monitoring/perf_context.cc \ monitoring/perf_level.cc \ + monitoring/persistent_stats_history.cc \ monitoring/statistics.cc \ monitoring/thread_status_impl.cc \ monitoring/thread_status_updater.cc \ @@ -93,75 +114,67 @@ LIB_SOURCES = \ options/options_sanity_check.cc \ 
port/port_posix.cc \ port/stack_trace.cc \ - table/adaptive_table_factory.cc \ - table/block.cc \ - table/block_based_filter_block.cc \ - table/block_based_table_builder.cc \ - table/block_based_table_factory.cc \ - table/block_based_table_reader.cc \ - table/block_builder.cc \ - table/block_fetcher.cc \ - table/block_prefix_index.cc \ - table/bloom_block.cc \ - table/cuckoo_table_builder.cc \ - table/cuckoo_table_factory.cc \ - table/cuckoo_table_reader.cc \ - table/data_block_hash_index.cc \ - table/data_block_footer.cc \ - table/flush_block_policy.cc \ + table/adaptive/adaptive_table_factory.cc \ + table/block_based/block.cc \ + table/block_based/block_based_filter_block.cc \ + table/block_based/block_based_table_builder.cc \ + table/block_based/block_based_table_factory.cc \ + table/block_based/block_based_table_reader.cc \ + table/block_based/block_builder.cc \ + table/block_based/block_prefix_index.cc \ + table/block_based/data_block_hash_index.cc \ + table/block_based/data_block_footer.cc \ + table/block_based/filter_block_reader_common.cc \ + table/block_based/filter_policy.cc \ + table/block_based/flush_block_policy.cc \ + table/block_based/full_filter_block.cc \ + table/block_based/index_builder.cc \ + table/block_based/parsed_full_filter_block.cc \ + table/block_based/partitioned_filter_block.cc \ + table/block_based/uncompression_dict_reader.cc \ + table/block_fetcher.cc \ + table/cuckoo/cuckoo_table_builder.cc \ + table/cuckoo/cuckoo_table_factory.cc \ + table/cuckoo/cuckoo_table_reader.cc \ table/format.cc \ - table/full_filter_block.cc \ table/get_context.cc \ - table/index_builder.cc \ table/iterator.cc \ table/merging_iterator.cc \ table/meta_blocks.cc \ - table/partitioned_filter_block.cc \ table/persistent_cache_helper.cc \ - table/plain_table_builder.cc \ - table/plain_table_factory.cc \ - table/plain_table_index.cc \ - table/plain_table_key_coding.cc \ - table/plain_table_reader.cc \ + table/plain/plain_table_bloom.cc \ + table/plain/plain_table_builder.cc \ + table/plain/plain_table_factory.cc \ + table/plain/plain_table_index.cc \ + table/plain/plain_table_key_coding.cc \ + table/plain/plain_table_reader.cc \ table/sst_file_reader.cc \ table/sst_file_writer.cc \ table/table_properties.cc \ table/two_level_iterator.cc \ + test_util/sync_point.cc \ + test_util/sync_point_impl.cc \ + test_util/transaction_test_util.cc \ tools/dump/db_dump_tool.cc \ - util/arena.cc \ - util/auto_roll_logger.cc \ - util/bloom.cc \ + trace_replay/trace_replay.cc \ + trace_replay/block_cache_tracer.cc \ util/build_version.cc \ util/coding.cc \ util/compaction_job_stats_impl.cc \ util/comparator.cc \ util/compression_context_cache.cc \ - util/concurrent_arena.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ - util/delete_scheduler.cc \ util/dynamic_bloom.cc \ - util/event_logger.cc \ - util/file_reader_writer.cc \ - util/file_util.cc \ - util/filename.cc \ - util/filter_policy.cc \ util/hash.cc \ - util/jemalloc_nodump_allocator.cc \ - util/log_buffer.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ util/slice.cc \ - util/sst_file_manager_impl.cc \ util/status.cc \ util/string_util.cc \ - util/sync_point.cc \ - util/sync_point_impl.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ - util/trace_replay.cc \ - util/transaction_test_util.cc \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_compaction_filter.cc \ @@ -185,10 +198,12 @@ LIB_SOURCES = \ utilities/memory/memory_util.cc \ utilities/merge_operators/max.cc \ 
utilities/merge_operators/put.cc \ + utilities/merge_operators/sortlist.cc \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/string_append/stringappend2.cc \ utilities/merge_operators/uint64add.cc \ utilities/merge_operators/bytesxor.cc \ + utilities/object_registry.cc \ utilities/option_change_migration/option_change_migration.cc \ utilities/options/options_util.cc \ utilities/persistent_cache/block_cache_tier.cc \ @@ -196,6 +211,7 @@ LIB_SOURCES = \ utilities/persistent_cache/block_cache_tier_metadata.cc \ utilities/persistent_cache/persistent_cache_tier.cc \ utilities/persistent_cache/volatile_tier_impl.cc \ + utilities/simulator_cache/cache_simulator.cc \ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ @@ -216,6 +232,11 @@ LIB_SOURCES = \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ +ifeq ($(ARMCRC_SOURCE),1) +LIB_SOURCES +=\ + util/crc32c_arm64.cc +endif + ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) LIB_SOURCES_ASM =\ util/crc32c_ppc_asm.S @@ -226,37 +247,48 @@ LIB_SOURCES_ASM = LIB_SOURCES_C = endif -TOOL_LIB_SOURCES = \ +TOOL_LIB_SOURCES = \ tools/ldb_cmd.cc \ tools/ldb_tool.cc \ tools/sst_dump_tool.cc \ utilities/blob_db/blob_dump_tool.cc \ -ANALYZER_LIB_SOURCES = \ - tools/trace_analyzer_tool.cc \ +ANALYZER_LIB_SOURCES = \ + tools/block_cache_analyzer/block_cache_trace_analyzer.cc \ + tools/trace_analyzer_tool.cc \ -MOCK_LIB_SOURCES = \ - table/mock_table.cc \ - util/fault_injection_test_env.cc +MOCK_LIB_SOURCES = \ + table/mock_table.cc \ + test_util/fault_injection_test_env.cc -BENCH_LIB_SOURCES = \ +BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ -TEST_LIB_SOURCES = \ +STRESS_LIB_SOURCES = \ + tools/db_stress_tool.cc \ + +TEST_LIB_SOURCES = \ db/db_test_util.cc \ - util/testharness.cc \ - util/testutil.cc \ + test_util/testharness.cc \ + test_util/testutil.cc \ utilities/cassandra/test_utils.cc \ +FOLLY_SOURCES = \ + third-party/folly/folly/detail/Futex.cpp \ + third-party/folly/folly/synchronization/AtomicNotification.cpp \ + third-party/folly/folly/synchronization/DistributedMutex.cpp \ + third-party/folly/folly/synchronization/ParkingLot.cpp \ + third-party/folly/folly/synchronization/WaitOptions.cpp \ + MAIN_SOURCES = \ cache/cache_bench.cc \ cache/cache_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ - db/compaction_iterator_test.cc \ - db/compaction_job_stats_test.cc \ - db/compaction_job_test.cc \ - db/compaction_picker_test.cc \ + db/compaction/compaction_iterator_test.cc \ + db/compaction/compaction_job_test.cc \ + db/compaction/compaction_job_stats_test.cc \ + db/compaction/compaction_picker_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ @@ -277,10 +309,11 @@ MAIN_SOURCES = \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ + db/db_merge_operand_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_secondary_test.cc \ + db/db_impl/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ @@ -293,7 +326,7 @@ MAIN_SOURCES = \ db/dbformat_test.cc \ db/deletefile_test.cc \ db/env_timed_test.cc \ - db/error_handler_test.cc \ + db/error_handler_test.cc \ db/external_sst_file_basic_test.cc \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ 
@@ -314,7 +347,6 @@ MAIN_SOURCES = \ db/obsolete_files_test.cc \ db/options_settable_test.cc \ db/options_file_test.cc \ - db/partitioned_filter_block_test.cc \ db/perf_context_test.cc \ db/persistent_cache_test.cc \ db/plain_table_db_test.cc \ @@ -335,6 +367,10 @@ MAIN_SOURCES = \ env/env_basic_test.cc \ env/env_test.cc \ env/mock_env_test.cc \ + logging/auto_roll_logger_test.cc \ + logging/env_logger_test.cc \ + logging/event_logger_test.cc \ + memory/arena_test.cc \ memtable/inlineskiplist_test.cc \ memtable/memtablerep_bench.cc \ memtable/skiplist_test.cc \ @@ -342,34 +378,37 @@ MAIN_SOURCES = \ monitoring/histogram_test.cc \ monitoring/iostats_context_test.cc \ monitoring/statistics_test.cc \ + monitoring/stats_history_test.cc \ options/options_test.cc \ - table/block_based_filter_block_test.cc \ - table/block_test.cc \ + table/block_based/block_based_filter_block_test.cc \ + table/block_based/block_test.cc \ + table/block_based/data_block_hash_index_test.cc \ + table/block_based/full_filter_block_test.cc \ + table/block_based/partitioned_filter_block_test.cc \ table/cleanable_test.cc \ - table/cuckoo_table_builder_test.cc \ - table/cuckoo_table_reader_test.cc \ - table/data_block_hash_index_test.cc \ - table/full_filter_block_test.cc \ + table/cuckoo/cuckoo_table_builder_test.cc \ + table/cuckoo/cuckoo_table_reader_test.cc \ table/merger_test.cc \ table/sst_file_reader_test.cc \ table/table_reader_bench.cc \ table/table_test.cc \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc \ + tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc \ tools/db_bench.cc \ tools/db_bench_tool_test.cc \ tools/db_sanity_test.cc \ + tools/db_stress.cc \ tools/ldb_cmd_test.cc \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ - tools/trace_analyzer_test.cc \ - util/arena_test.cc \ - util/auto_roll_logger_test.cc \ + tools/trace_analyzer_test.cc \ + trace_replay/block_cache_tracer_test.cc \ util/autovector_test.cc \ util/bloom_test.cc \ util/coding_test.cc \ util/crc32c_test.cc \ util/dynamic_bloom_test.cc \ - util/event_logger_test.cc \ util/filelock_test.cc \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ @@ -390,6 +429,7 @@ MAIN_SOURCES = \ utilities/object_registry_test.cc \ utilities/option_change_migration/option_change_migration_test.cc \ utilities/options/options_util_test.cc \ + utilities/simulator_cache/cache_simulator_test.cc \ utilities/simulator_cache/sim_cache_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ @@ -446,6 +486,8 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/snapshot.cc \ java/rocksjni/sst_file_manager.cc \ java/rocksjni/sst_file_writerjni.cc \ + java/rocksjni/sst_file_readerjni.cc \ + java/rocksjni/sst_file_reader_iterator.cc \ java/rocksjni/statistics.cc \ java/rocksjni/statisticsjni.cc \ java/rocksjni/table.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc similarity index 98% rename from table/adaptive_table_factory.cc rename to table/adaptive/adaptive_table_factory.cc index d5dcbc5f585..0086368a9bb 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/adaptive_table_factory.h" +#include "table/adaptive/adaptive_table_factory.h" #include "table/table_builder.h" #include "table/format.h" diff --git a/table/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h similarity index 100% rename from table/adaptive_table_factory.h rename to table/adaptive/adaptive_table_factory.h diff --git a/table/block.cc b/table/block_based/block.cc similarity index 93% rename from table/block.cc rename to table/block_based/block.cc index 80bef4a913f..8fa3ff9b986 100644 --- a/table/block.cc +++ b/table/block_based/block.cc @@ -9,21 +9,21 @@ // // Decodes the blocks generated by block_builder.cc. -#include "table/block.h" +#include "table/block_based/block.h" #include #include #include #include +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" -#include "table/block_prefix_index.h" -#include "table/data_block_footer.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_footer.h" #include "table/format.h" #include "util/coding.h" -#include "util/logging.h" namespace rocksdb { @@ -381,6 +381,7 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { } void IndexBlockIter::Seek(const Slice& target) { + TEST_SYNC_POINT("IndexBlockIter::Seek:0"); Slice seek_key = target; if (!key_includes_seq_) { seek_key = ExtractUserKey(target); @@ -607,8 +608,7 @@ bool IndexBlockIter::ParseNextIndexKey() { } // else we are in the middle of a restart interval and the restart_index_ // thus has not changed - if (value_delta_encoded_) { - assert(value_length == 0); + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { DecodeCurrentValue(shared); } return true; @@ -626,24 +626,32 @@ bool IndexBlockIter::ParseNextIndexKey() { // Otherwise the format is delta-size = block handle size - size of last block // handle. void IndexBlockIter::DecodeCurrentValue(uint32_t shared) { - assert(value_delta_encoded_); - const char* limit = data_ + restarts_; - if (shared == 0) { - uint64_t o, s; - const char* newp = GetVarint64Ptr(value_.data(), limit, &o); - assert(newp); - newp = GetVarint64Ptr(newp, limit, &s); - assert(newp); - decoded_value_ = BlockHandle(o, s); - value_ = Slice(value_.data(), newp - value_.data()); - } else { - uint64_t next_value_base = - decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize; - int64_t delta; - const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta); - decoded_value_ = - BlockHandle(next_value_base, decoded_value_.size() + delta); - value_ = Slice(value_.data(), newp - value_.data()); + Slice v(value_.data(), data_ + restarts_ - value_.data()); + // Delta encoding is used if `shared` != 0. + Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom( + &v, have_first_key_, + (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr); + assert(decode_s.ok()); + value_ = Slice(value_.data(), v.data() - value_.data()); + + if (global_seqno_state_ != nullptr) { + // Overwrite sequence number the same way as in DataBlockIter. 
+ + IterKey& first_internal_key = global_seqno_state_->first_internal_key; + first_internal_key.SetInternalKey(decoded_value_.first_internal_key, + /* copy */ true); + + assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0); + + ValueType value_type = ExtractValueType(first_internal_key.GetKey()); + assert(value_type == ValueType::kTypeValue || + value_type == ValueType::kTypeMerge || + value_type == ValueType::kTypeDeletion || + value_type == ValueType::kTypeRangeDeletion); + + first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, + value_type); + decoded_value_.first_internal_key = first_internal_key.GetKey(); } } @@ -874,14 +882,10 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, } } -template <> -DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, - DataBlockIter* iter, Statistics* stats, - bool /*total_order_seek*/, - bool /*key_includes_seq*/, - bool /*value_is_full*/, - bool block_contents_pinned, - BlockPrefixIndex* /*prefix_index*/) { +DataBlockIter* Block::NewDataIterator(const Comparator* cmp, + const Comparator* ucmp, + DataBlockIter* iter, Statistics* stats, + bool block_contents_pinned) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -912,13 +916,11 @@ DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp, return ret_iter; } -template <> -IndexBlockIter* Block::NewIterator(const Comparator* cmp, - const Comparator* ucmp, IndexBlockIter* iter, - Statistics* /*stats*/, bool total_order_seek, - bool key_includes_seq, bool value_is_full, - bool block_contents_pinned, - BlockPrefixIndex* prefix_index) { +IndexBlockIter* Block::NewIndexIterator( + const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter, + Statistics* /*stats*/, bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, bool block_contents_pinned, + BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -937,9 +939,9 @@ IndexBlockIter* Block::NewIterator(const Comparator* cmp, BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index; ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_, - prefix_index_ptr, key_includes_seq, value_is_full, - block_contents_pinned, - nullptr /* data_block_hash_index */); + global_seqno_, prefix_index_ptr, have_first_key, + key_includes_seq, value_is_full, + block_contents_pinned); } return ret_iter; diff --git a/table/block.h b/table/block_based/block.h similarity index 81% rename from table/block.h rename to table/block_based/block.h index df4d4eb82fc..73c21b4659a 100644 --- a/table/block.h +++ b/table/block_based/block.h @@ -12,26 +12,20 @@ #include #include #include -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" -#include "format.h" +#include "port/malloc.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/table.h" -#include "table/block_prefix_index.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/data_block_hash_index.h" +#include "table/format.h" #include "table/internal_iterator.h" +#include "test_util/sync_point.h" #include "util/random.h" -#include "util/sync_point.h" namespace rocksdb { @@ -141,12 +135,27 @@ class BlockReadAmpBitmap { uint32_t rnd_; }; +// This Block class is not for any old block: it is designed to hold only +// uncompressed blocks containing sorted key-value pairs. It is thus +// suitable for storing uncompressed data blocks, index blocks (including +// partitions), range deletion blocks, properties blocks, metaindex blocks, +// as well as the top level of the partitioned filter structure (which is +// actually an index of the filter partitions). It is NOT suitable for +// compressed blocks in general, filter blocks/partitions, or compression +// dictionaries (since the latter do not contain sorted key-value pairs). +// Use BlockContents directly for those. +// +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details of the format and the various block types. class Block { public: // Initialize the block with the specified contents. explicit Block(BlockContents&& contents, SequenceNumber _global_seqno, size_t read_amp_bytes_per_bit = 0, Statistics* statistics = nullptr); + // No copying allowed + Block(const Block&) = delete; + void operator=(const Block&) = delete; ~Block(); @@ -165,17 +174,7 @@ class Block { // If iter is null, return new Iterator // If iter is not null, update this one and return it as Iterator* // - // key_includes_seq, default true, means that the keys are in internal key - // format. - // value_is_full, default true, means that no delta encoding is - // applied to values. - // - // NewIterator - // Same as above but also updates read_amp_bitmap_ if it is not nullptr. - // - // NewIterator - // If `prefix_index` is not nullptr this block will do hash lookup for the key - // prefix. If total_order_seek is true, prefix_index_ is ignored. + // Updates read_amp_bitmap_ if it is not nullptr. // // If `block_contents_pinned` is true, the caller will guarantee that when // the cleanup functions are transferred from the iterator to other @@ -188,13 +187,32 @@ class Block { // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. 
- template - TBlockIter* NewIterator( - const Comparator* comparator, const Comparator* user_comparator, - TBlockIter* iter = nullptr, Statistics* stats = nullptr, - bool total_order_seek = true, bool key_includes_seq = true, - bool value_is_full = true, bool block_contents_pinned = false, - BlockPrefixIndex* prefix_index = nullptr); + + DataBlockIter* NewDataIterator(const Comparator* comparator, + const Comparator* user_comparator, + DataBlockIter* iter = nullptr, + Statistics* stats = nullptr, + bool block_contents_pinned = false); + + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + // + // If `prefix_index` is not nullptr this block will do hash lookup for the key + // prefix. If total_order_seek is true, prefix_index_ is ignored. + // + // `have_first_key` controls whether IndexValue will contain + // first_internal_key. It affects data serialization format, so the same value + // have_first_key must be used when writing and reading index. + // It is determined by IndexType property of the table. + IndexBlockIter* NewIndexIterator(const Comparator* comparator, + const Comparator* user_comparator, + IndexBlockIter* iter, Statistics* stats, + bool total_order_seek, bool have_first_key, + bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -213,10 +231,6 @@ class Block { const SequenceNumber global_seqno_; DataBlockHashIndex data_block_hash_index_; - - // No copying allowed - Block(const Block&) = delete; - void operator=(const Block&) = delete; }; template @@ -236,6 +250,7 @@ class BlockIter : public InternalIteratorBase { restart_index_ = num_restarts_; global_seqno_ = global_seqno; block_contents_pinned_ = block_contents_pinned; + cache_handle_ = nullptr; } // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do @@ -285,6 +300,10 @@ class BlockIter : public InternalIteratorBase { return static_cast(value_.data() - data_); } + void SetCacheHandle(Cache::Handle* handle) { cache_handle_ = handle; } + + Cache::Handle* cache_handle() { return cache_handle_; } + protected: // Note: The type could be changed to InternalKeyComparator but we see a weird // performance drop by that. @@ -307,6 +326,14 @@ class BlockIter : public InternalIteratorBase { bool block_contents_pinned_; SequenceNumber global_seqno_; + private: + // Store the cache handle, if the block is cached. We need this since the + // only other place the handle is stored is as an argument to the Cleanable + // function callback, which is hard to retrieve. When multiple value + // PinnableSlices reference the block, they need the cache handle in order + // to bump up the ref count + Cache::Handle* cache_handle_; + public: // Return the offset in data_ just past the end of the current entry. inline uint32_t NextEntryOffset() const { @@ -458,7 +485,7 @@ class DataBlockIter final : public BlockIter { bool SeekForGetImpl(const Slice& target); }; -class IndexBlockIter final : public BlockIter { +class IndexBlockIter final : public BlockIter { public: IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} @@ -470,23 +497,12 @@ class IndexBlockIter final : public BlockIter { // format. // value_is_full, default true, means that no delta encoding is // applied to values. 
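Illustrative note (not part of the patch): the templated Block::NewIterator<TBlockIter>() entry point is replaced above by the dedicated NewDataIterator() and NewIndexIterator() factories. A minimal caller-side sketch of the data-block path, assuming a BlockContents already read from an SST file, bytewise comparators, and the usual InternalIterator methods:

#include "rocksdb/comparator.h"
#include "table/block_based/block.h"

void DumpDataBlock(rocksdb::BlockContents&& contents) {
  using namespace rocksdb;
  // Global seqno handling is irrelevant for this sketch, so it is disabled.
  Block block(std::move(contents), kDisableGlobalSequenceNumber);
  // Previously: block.NewIterator<DataBlockIter>(cmp, ucmp, ...);
  // now the dedicated factory method for data blocks is used instead.
  DataBlockIter* iter = block.NewDataIterator(
      BytewiseComparator(), BytewiseComparator(), nullptr /* iter */,
      nullptr /* stats */, false /* block_contents_pinned */);
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // iter->key() is an internal key; iter->value() is the stored value.
  }
  delete iter;  // owned by the caller when no reusable iterator was passed in
}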
- IndexBlockIter(const Comparator* comparator, - const Comparator* user_comparator, const char* data, - uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) - : IndexBlockIter() { - Initialize(comparator, user_comparator, data, restarts, num_restarts, - prefix_index, key_includes_seq, block_contents_pinned, - value_is_full, nullptr /* data_block_hash_index */); - } - void Initialize(const Comparator* comparator, const Comparator* user_comparator, const char* data, uint32_t restarts, uint32_t num_restarts, - BlockPrefixIndex* prefix_index, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned, - DataBlockHashIndex* /*data_block_hash_index*/) { + SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, + bool have_first_key, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) { InitializeBase(key_includes_seq ? comparator : user_comparator, data, restarts, num_restarts, kDisableGlobalSequenceNumber, block_contents_pinned); @@ -494,6 +510,12 @@ class IndexBlockIter final : public BlockIter { key_.SetIsUserKey(!key_includes_seq_); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; + have_first_key_ = have_first_key; + if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) { + global_seqno_state_.reset(new GlobalSeqnoState(global_seqno)); + } else { + global_seqno_state_.reset(); + } } Slice user_key() const override { @@ -503,16 +525,17 @@ class IndexBlockIter final : public BlockIter { return key(); } - virtual BlockHandle value() const override { + virtual IndexValue value() const override { assert(Valid()); - if (value_delta_encoded_) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr) { return decoded_value_; } else { - BlockHandle handle; + IndexValue entry; Slice v = value_; - Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + Status decode_s __attribute__((__unused__)) = + entry.DecodeFrom(&v, have_first_key_, nullptr); assert(decode_s.ok()); - return handle; + return entry; } } @@ -539,10 +562,15 @@ class IndexBlockIter final : public BlockIter { void Invalidate(Status s) { InvalidateBase(s); } + bool IsValuePinned() const override { + return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); + } + private: // Key is in InternalKey format bool key_includes_seq_; bool value_delta_encoded_; + bool have_first_key_; // value includes first_internal_key BlockPrefixIndex* prefix_index_; // Whether the value is delta encoded. In that case the value is assumed to be // BlockHandle. The first value in each restart interval is the full encoded @@ -550,7 +578,22 @@ class IndexBlockIter final : public BlockIter { // offset of delta encoded BlockHandles is computed by adding the size of // previous delta encoded values in the same restart interval to the offset of // the first value in that restart interval. - BlockHandle decoded_value_; + IndexValue decoded_value_; + + // When sequence number overwriting is enabled, this struct contains the seqno + // to overwrite with, and current first_internal_key with overwritten seqno. + // This is rarely used, so we put it behind a pointer and only allocate when + // needed. + struct GlobalSeqnoState { + // First internal key according to current index entry, but with sequence + // number overwritten to global_seqno. 
+ IterKey first_internal_key; + SequenceNumber global_seqno; + + explicit GlobalSeqnoState(SequenceNumber seqno) : global_seqno(seqno) {} + }; + + std::unique_ptr global_seqno_state_; bool PrefixSeek(const Slice& target, uint32_t* index); bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, diff --git a/table/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc similarity index 57% rename from table/block_based_filter_block.cc rename to table/block_based/block_based_filter_block.cc index 81087b243b7..319c5bf6d87 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -7,12 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_filter_block.h" +#include "table/block_based/block_based_filter_block.h" #include #include "db/dbformat.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" #include "util/string_util.h" @@ -162,56 +163,120 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { } BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( - const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, bool _whole_key_filtering, - BlockContents&& contents, Statistics* stats) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - policy_(table_opt.filter_policy.get()), - prefix_extractor_(prefix_extractor), - data_(nullptr), - offset_(nullptr), - num_(0), - base_lg_(0), - contents_(std::move(contents)) { - assert(policy_); - size_t n = contents_.data.size(); - if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents_.data[n - 1]; - uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); - if (last_word > n - 5) return; - data_ = contents_.data.data(); - offset_ = data_ + last_word; - num_ = (n - 5 - last_word) / 4; + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + assert(table()); + assert(table()->get_rep()); + assert(table()->get_rep()->filter_policy); +} + +std::unique_ptr BlockBasedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new BlockBasedFilterBlockReader(table, std::move(filter_block))); } bool BlockBasedFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { assert(block_offset != kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key, block_offset); + return MayMatch(key, block_offset, no_io, get_context, 
lookup_context); } bool BlockBasedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { assert(block_offset != kNotValid); - return MayMatch(prefix, block_offset); -} - -bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, - uint64_t block_offset) { - uint64_t index = block_offset >> base_lg_; - if (index < num_) { - uint32_t start = DecodeFixed32(offset_ + index * 4); - uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); - if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { - Slice filter = Slice(data_ + start, limit - start); - bool const may_match = policy_->KeyMayMatch(entry, filter); + return MayMatch(prefix, block_offset, no_io, get_context, lookup_context); +} + +bool BlockBasedFilterBlockReader::ParseFieldsFromBlock( + const BlockContents& contents, const char** data, const char** offset, + size_t* num, size_t* base_lg) { + assert(data); + assert(offset); + assert(num); + assert(base_lg); + + const size_t n = contents.data.size(); + if (n < 5) { // 1 byte for base_lg and 4 for start of offset array + return false; + } + + const uint32_t last_word = DecodeFixed32(contents.data.data() + n - 5); + if (last_word > n - 5) { + return false; + } + + *data = contents.data.data(); + *offset = (*data) + last_word; + *num = (n - 5 - last_word) / 4; + *base_lg = contents.data[n - 1]; + + return true; +} + +bool BlockBasedFilterBlockReader::MayMatch( + const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return true; // Errors are treated as potential matches + } + + const uint64_t index = block_offset >> base_lg; + if (index < num) { + const uint32_t start = DecodeFixed32(offset + index * 4); + const uint32_t limit = DecodeFixed32(offset + index * 4 + 4); + if (start <= limit && limit <= (uint32_t)(offset - data)) { + const Slice filter = Slice(data + start, limit - start); + + assert(table()); + assert(table()->get_rep()); + const FilterPolicy* const policy = table()->get_rep()->filter_policy; + + const bool may_match = policy->KeyMayMatch(entry, filter); if (may_match) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; @@ -228,27 +293,54 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, } size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { - return num_ * 4 + 5 + (offset_ - data_); + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; } std::string BlockBasedFilterBlockReader::ToString() const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + nullptr /* lookup_context */, &filter_block); + if (!s.ok()) { + return std::string("Unable to 
retrieve filter block"); + } + + assert(filter_block.GetValue()); + + const char* data = nullptr; + const char* offset = nullptr; + size_t num = 0; + size_t base_lg = 0; + if (!ParseFieldsFromBlock(*filter_block.GetValue(), &data, &offset, &num, + &base_lg)) { + return std::string("Error parsing filter block"); + } + std::string result; result.reserve(1024); std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); - AppendItem(&result, s_fb, rocksdb::ToString(num_)); + AppendItem(&result, s_fb, rocksdb::ToString(num)); AppendItem(&result, s_bo, s_hd); - for (size_t index = 0; index < num_; index++) { - uint32_t start = DecodeFixed32(offset_ + index * 4); - uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + for (size_t index = 0; index < num; index++) { + uint32_t start = DecodeFixed32(offset + index * 4); + uint32_t limit = DecodeFixed32(offset + index * 4 + 4); if (start != limit) { result.append(" filter block # " + rocksdb::ToString(index + 1) + "\n"); - Slice filter = Slice(data_ + start, limit - start); + Slice filter = Slice(data + start, limit - start); AppendItem(&result, start, filter.ToString(true)); } } return result; } + } // namespace rocksdb diff --git a/table/block_based_filter_block.h b/table/block_based/block_based_filter_block.h similarity index 64% rename from table/block_based_filter_block.h rename to table/block_based/block_based_filter_block.h index d1ff585462a..ed409e041ee 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -18,10 +18,12 @@ #include #include #include + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "table/filter_block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/format.h" #include "util/hash.h" namespace rocksdb { @@ -36,6 +38,9 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { public: BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, const BlockBasedTableOptions& table_opt); + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&) = delete; + void operator=(const BlockBasedFilterBlockBuilder&) = delete; virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; @@ -66,49 +71,49 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; size_t num_added_; // Number of keys added - - // No copying allowed - BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); - void operator=(const BlockBasedFilterBlockBuilder&); }; // A FilterBlockReader is used to parse filter from SST table. // KeyMayMatch and PrefixMayMatch would trigger filter checking -class BlockBasedFilterBlockReader : public FilterBlockReader { +class BlockBasedFilterBlockReader + : public FilterBlockReaderCommon { public: - // REQUIRES: "contents" and *policy must stay live while *this is live. 
- BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt, - bool whole_key_filtering, - BlockContents&& contents, Statistics* statistics); - virtual bool IsBlockBased() override { return true; } + BlockBasedFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&) = delete; + void operator=(const BlockBasedFilterBlockReader&) = delete; + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; + bool IsBlockBased() override { return true; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; // convert this object to a human readable form std::string ToString() const override; private: - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - const char* data_; // Pointer to filter data (at block-start) - const char* offset_; // Pointer to beginning of offset array (at block-end) - size_t num_; // Number of entries in offset array - size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - BlockContents contents_; + static bool ParseFieldsFromBlock(const BlockContents& contents, + const char** data, const char** offset, + size_t* num, size_t* base_lg); - bool MayMatch(const Slice& entry, uint64_t block_offset); - - // No copying allowed - BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); - void operator=(const BlockBasedFilterBlockReader&); + bool MayMatch(const Slice& entry, uint64_t block_offset, bool no_io, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; }; + } // namespace rocksdb diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc new file mode 100644 index 00000000000..677203ae24d --- /dev/null +++ b/table/block_based/block_based_filter_block_test.cc @@ -0,0 +1,434 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
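For reference while reading the reader refactor above and the tests that follow (illustrative only, not part of the patch): the on-disk layout of a legacy block-based filter block is unchanged, only its parsing moved into the static ParseFieldsFromBlock() helper. A self-contained sketch of that layout, with DecodeLE32 standing in for RocksDB's DecodeFixed32:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Layout parsed above:
//   [filter 0][filter 1]...[offset array, 4 bytes per filter]
//   [offset of offset array, 4 bytes][base_lg, 1 byte]
static uint32_t DecodeLE32(const char* p) {
  uint32_t v;
  std::memcpy(&v, p, sizeof(v));  // assumes a little-endian host
  return v;
}

bool ParseFilterBlock(const char* data, size_t n, const char** offset_array,
                      size_t* num_filters, size_t* base_lg) {
  if (n < 5) return false;  // 1 byte for base_lg, 4 for offset-array start
  const uint32_t last_word = DecodeLE32(data + n - 5);
  if (last_word > n - 5) return false;
  *offset_array = data + last_word;
  *num_filters = (n - 5 - last_word) / 4;
  *base_lg = static_cast<unsigned char>(data[n - 1]);
  return true;
}

// A data block written at file offset `off` is covered by filter number
// (off >> base_lg); its bytes are the [start, limit) range read from the
// offset array, exactly as MayMatch() does before handing the slice to the
// FilterPolicy.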
+ +#include "table/block_based/block_based_filter_block.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/mock_block_based_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class FilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + FilterBlockTest() : mock::MockBlockBasedTableTester(new TestHashFilter) {} +}; + +TEST_F(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + Slice slice(builder.Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + ASSERT_EQ(0, builder.NumAdded()); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice(builder.Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + Slice slice(builder.Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + BlockBasedFilterBlockReader reader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, 
/*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +// Test for block based filter block +// use new interface in FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + BlockBasedFilterBlockTest() + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, true)) {} +}; + +TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + Slice slice(builder->Finish()); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(slice)); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/10000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + // Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + Slice slice(builder->Finish()); + + CachableEntry block( + new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + + FilterBlockReader* reader = + new BlockBasedFilterBlockReader(table_.get(), std::move(block)); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/2000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/uint64_t{0}, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/3100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/4100, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch( + "box", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader->KeyMayMatch( + "hello", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "foo", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader->KeyMayMatch( + "bar", /*prefix_extractor=*/nullptr, /*block_offset=*/9000, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + + delete builder; + delete reader; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/table/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc similarity index 94% rename from table/block_based_table_builder.cc rename to table/block_based/block_based_table_builder.cc index 9a1742e5f3a..58f3ff4339a 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include #include @@ -20,36 +20,35 @@ #include #include "db/dbformat.h" +#include "index_builder.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" -#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/full_filter_block.h" #include "table/table_builder.h" +#include "memory/memory_allocator.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" - namespace rocksdb { extern const std::string kHashIndexPrefixesBlock; @@ -63,13 +62,14 @@ namespace { // Create a filter block builder based on its type. FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, - const BlockBasedTableOptions& table_opt, + const FilterBuildingContext& context, const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { + const BlockBasedTableOptions& table_opt = context.table_options; if (table_opt.filter_policy == nullptr) return nullptr; FilterBitsBuilder* filter_bits_builder = - table_opt.filter_policy->GetFilterBitsBuilder(); + BloomFilterPolicy::GetBuilderFromContext(context); if (filter_bits_builder == nullptr) { return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt); @@ -346,6 +346,7 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; + int level_at_creation; uint32_t column_family_id; const std::string& column_family_name; uint64_t creation_time = 0; @@ -364,9 +365,9 @@ struct BlockBasedTableBuilder::Rep { const CompressionType _compression_type, const uint64_t _sample_for_compression, const CompressionOptions& _compression_opts, const bool skip_filters, - const std::string& _column_family_name, const uint64_t _creation_time, - const uint64_t _oldest_key_time, const uint64_t _target_file_size, - const uint64_t _file_creation_time) + const int _level_at_creation, const std::string& _column_family_name, + const uint64_t _creation_time, const uint64_t _oldest_key_time, + const uint64_t _target_file_size, const uint64_t _file_creation_time) : ioptions(_ioptions), moptions(_moptions), table_options(table_opt), @@ -399,6 +400,7 @@ struct BlockBasedTableBuilder::Rep { flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), + level_at_creation(_level_at_creation), column_family_id(_column_family_id), 
column_family_name(_column_family_name), creation_time(_creation_time), @@ -420,9 +422,13 @@ struct BlockBasedTableBuilder::Rep { if (skip_filters) { filter_builder = nullptr; } else { + FilterBuildingContext context(table_options); + context.column_family_name = column_family_name; + context.compaction_style = ioptions.compaction_style; + context.level_at_creation = level_at_creation; filter_builder.reset(CreateFilterBlockBuilder( - _ioptions, _moptions, table_options, - use_delta_encoding_for_index_values, p_index_builder_)); + ioptions, moptions, context, use_delta_encoding_for_index_values, + p_index_builder_)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -455,9 +461,9 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const CompressionType compression_type, const uint64_t sample_for_compression, const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const uint64_t creation_time, - const uint64_t oldest_key_time, const uint64_t target_file_size, - const uint64_t file_creation_time) { + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time, const uint64_t oldest_key_time, + const uint64_t target_file_size, const uint64_t file_creation_time) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { @@ -470,12 +476,12 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( sanitized_table_options.format_version = 1; } - rep_ = - new Rep(ioptions, moptions, sanitized_table_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, file, - compression_type, sample_for_compression, compression_opts, - skip_filters, column_family_name, creation_time, oldest_key_time, - target_file_size, file_creation_time); + rep_ = new Rep(ioptions, moptions, sanitized_table_options, + internal_comparator, int_tbl_prop_collector_factories, + column_family_id, file, compression_type, + sample_for_compression, compression_opts, skip_filters, + level_at_creation, column_family_name, creation_time, + oldest_key_time, target_file_size, file_creation_time); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); @@ -532,7 +538,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. 
if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { - r->filter_builder->Add(ExtractUserKey(key)); + size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); } r->last_key.assign(key.data(), key.size()); @@ -733,11 +740,13 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, break; } case kxxHash: { - void* xxh = XXH32_init(0); - XXH32_update(xxh, block_contents.data(), + XXH32_state_t* const state = XXH32_createState(); + XXH32_reset(state, 0); + XXH32_update(state, block_contents.data(), static_cast(block_contents.size())); - XXH32_update(xxh, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); + XXH32_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32(trailer_without_type, XXH32_digest(state)); + XXH32_freeState(state); break; } case kxxHash64: { diff --git a/table/block_based_table_builder.h b/table/block_based/block_based_table_builder.h similarity index 96% rename from table/block_based_table_builder.h rename to table/block_based/block_based_table_builder.h index a1ef3889112..5dd5065bb20 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -47,17 +47,18 @@ class BlockBasedTableBuilder : public TableBuilder { const CompressionType compression_type, const uint64_t sample_for_compression, const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0, + const std::string& column_family_name, const int level_at_creation, + const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + const uint64_t target_file_size = 0, const uint64_t file_creation_time = 0); - // REQUIRES: Either Finish() or Abandon() has been called. - ~BlockBasedTableBuilder(); - // No copying allowed BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + // REQUIRES: Either Finish() or Abandon() has been called. + ~BlockBasedTableBuilder(); + // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called diff --git a/table/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc similarity index 95% rename from table/block_based_table_factory.cc rename to table/block_based/block_based_table_factory.cc index 47fe8e1b0e3..a3368610dc8 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -7,14 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
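// --- Illustrative aside (editorial, not part of the patch) -----------------
// The WriteRawBlock() hunk above replaces the older XXH32_init()/XXH32_digest()
// pair with xxHash's streaming state API. The pattern, shown here as a minimal
// sketch over a block payload plus its one-byte type trailer (the calls are
// exactly those used in the hunk; `data`, `size` and `trailer` are placeholder
// names):
//
//   XXH32_state_t* const state = XXH32_createState();
//   XXH32_reset(state, /*seed=*/0);
//   XXH32_update(state, data, static_cast<uint32_t>(size));
//   XXH32_update(state, trailer, 1);   // extend the hash to cover block type
//   const uint32_t checksum = XXH32_digest(state);
//   XXH32_freeState(state);            // streaming state must be freed
// ----------------------------------------------------------------------------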
-#include "table/block_based_table_factory.h" - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include #include +#include #include #include @@ -24,8 +18,9 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/flush_block_policy.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "util/mutexlock.h" #include "util/string_util.h" @@ -172,7 +167,12 @@ BlockBasedTableFactory::BlockBasedTableFactory( if (table_options_.no_block_cache) { table_options_.block_cache.reset(); } else if (table_options_.block_cache == nullptr) { - table_options_.block_cache = NewLRUCache(8 << 20); + LRUCacheOptions co; + co.capacity = 8 << 20; + // It makes little sense to pay overhead for mid-point insertion while the + // block size is only 8MB. + co.high_pri_pool_ratio = 0.0; + table_options_.block_cache = NewLRUCache(co); } if (table_options_.block_size_deviation < 0 || table_options_.block_size_deviation > 100) { @@ -203,7 +203,8 @@ Status BlockBasedTableFactory::NewTableReader( file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, - table_reader_options.largest_seqno, &tail_prefetch_stats_); + table_reader_options.largest_seqno, &tail_prefetch_stats_, + table_reader_options.block_cache_tracer); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( @@ -217,7 +218,7 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( table_builder_options.sample_for_compression, table_builder_options.compression_opts, table_builder_options.skip_filters, - table_builder_options.column_family_name, + table_builder_options.column_family_name, table_builder_options.level, table_builder_options.creation_time, table_builder_options.oldest_key_time, table_builder_options.target_file_size, @@ -227,7 +228,7 @@ TableBuilder* BlockBasedTableFactory::NewTableBuilder( } Status BlockBasedTableFactory::SanitizeOptions( - const DBOptions& /*db_opts*/, const ColumnFamilyOptions& cf_opts) const { + const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { return Status::InvalidArgument( @@ -261,6 +262,10 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument( "Block alignment requested but block size is not a power of 2"); } + if (table_options_.block_size > port::kMaxUint32) { + return Status::InvalidArgument( + "block size exceeds maximum number (4GiB) allowed"); + } if (table_options_.data_block_index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash && table_options_.data_block_hash_table_util_ratio <= 0) { @@ -268,6 +273,12 @@ Status BlockBasedTableFactory::SanitizeOptions( "data_block_hash_table_util_ratio should be greater than 0 when " "data_block_index_type is set to kDataBlockBinaryAndHash"); } + if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { + // TODO(myabandeh): support it + return Status::InvalidArgument( + "max_successive_merges larger than 0 is currently inconsistent with " + "unordered_write"); + } return Status::OK(); } @@ -499,8 +510,8 @@ std::string ParseBlockBasedTableOption(const std::string& name, if (pos == 
std::string::npos) { return "Invalid filter policy config, missing bits_per_key"; } - int bits_per_key = - ParseInt(trim(value.substr(kName.size(), pos - kName.size()))); + double bits_per_key = + ParseDouble(trim(value.substr(kName.size(), pos - kName.size()))); bool use_block_based_builder = ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); new_options->filter_policy.reset( diff --git a/table/block_based_table_factory.h b/table/block_based/block_based_table_factory.h similarity index 100% rename from table/block_based_table_factory.h rename to table/block_based/block_based_table_factory.h diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc new file mode 100644 index 00000000000..ffdbdb1bd5d --- /dev/null +++ b/table/block_based/block_based_table_reader.cc @@ -0,0 +1,4409 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "table/block_based/block_based_table_reader.h" + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" + +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" + +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" + +#include "table/block_based/block.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_prefix_index.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_fetcher.h" +#include "table/format.h" +#include "table/get_context.h" +#include "table/internal_iterator.h" +#include "table/meta_blocks.h" +#include "table/multiget_context.h" +#include "table/persistent_cache_helper.h" +#include "table/sst_file_writer_collectors.h" +#include "table/two_level_iterator.h" + +#include "monitoring/perf_context_imp.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace rocksdb { + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const std::string kHashIndexPrefixesBlock; +extern const std::string kHashIndexPrefixesMetadataBlock; + +typedef BlockBasedTable::IndexReader IndexReader; + +// Found that 256 KB readahead size provides the best performance, based on +// experiments, for auto readahead. Experiment data is in PR #3282. 
+const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; + +BlockBasedTable::~BlockBasedTable() { + delete rep_; +} + +std::atomic BlockBasedTable::next_cache_key_id_(0); + +template +class BlocklikeTraits; + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } +}; + +template <> +class BlocklikeTraits { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, Statistics* statistics, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, + statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } +}; + +template <> +class BlocklikeTraits { + public: + static UncompressionDict* Create(BlockContents&& contents, + SequenceNumber /* global_seqno */, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } +}; + +namespace { +// Read the block identified by "handle" from "file". +// The only relevant option is options.verify_checksums for now. +// On failure return non-OK. +// On success fill *result and return OK - caller owns *result +// @param uncompression_dict Data for presetting the compression library's +// dictionary. 
+template +Status ReadBlockFromFile( + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, + std::unique_ptr* result, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, BlockType block_type, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool for_compaction, bool using_zstd, const FilterPolicy* filter_policy) { + assert(result); + + BlockContents contents; + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, options, handle, &contents, ioptions, + do_uncompress, maybe_compressed, block_type, uncompression_dict, + cache_options, memory_allocator, nullptr, for_compaction); + Status s = block_fetcher.ReadBlockContents(); + if (s.ok()) { + result->reset(BlocklikeTraits::Create( + std::move(contents), global_seqno, read_amp_bytes_per_bit, + ioptions.statistics, using_zstd, filter_policy)); + } + + return s; +} + +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + +// Delete the entry resided in the cache. +template +void DeleteCachedEntry(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} + +// Release the cached entry and decrement its ref count. +void ForceReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle, true /* force_erase */); +} + +// Release the cached entry and decrement its ref count. +// Do not force erase +void ReleaseCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle, false /* force_erase */); +} + +// For hash based index, return true if prefix_extractor and +// prefix_extractor_block mismatch, false otherwise. This flag will be used +// as total_order_seek via NewIndexIterator +bool PrefixExtractorChanged(const TableProperties* table_properties, + const SliceTransform* prefix_extractor) { + // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. + // Turn off hash index in prefix_extractor is not set; if prefix_extractor + // is set but prefix_extractor_block is not set, also disable hash index + if (prefix_extractor == nullptr || table_properties == nullptr || + table_properties->prefix_extractor_name.empty()) { + return true; + } + + // prefix_extractor and prefix_extractor_block are both non-empty + if (table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) != 0) { + return true; + } else { + return false; + } +} + +CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { + CacheAllocationPtr heap_buf; + heap_buf = AllocateBlock(buf.size(), allocator); + memcpy(heap_buf.get(), buf.data(), buf.size()); + return heap_buf; +} + +} // namespace + +// Encapsulates common functionality for the various index reader +// implementations. 
Provides access to the index block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. +class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { + public: + IndexReaderCommon(const BlockBasedTable* t, + CachableEntry&& index_block) + : table_(t), index_block_(std::move(index_block)) { + assert(table_ != nullptr); + } + + protected: + static Status ReadIndexBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block); + + const BlockBasedTable* table() const { return table_; } + + const InternalKeyComparator* internal_comparator() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + + return &table_->get_rep()->internal_comparator; + } + + bool index_has_first_key() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_has_first_key; + } + + bool index_key_includes_seq() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_key_includes_seq; + } + + bool index_value_is_full() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->index_value_is_full; + } + + bool cache_index_blocks() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->table_options.cache_index_and_filter_blocks; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const; + + size_t ApproximateIndexBlockMemoryUsage() const { + assert(!index_block_.GetOwnValue() || index_block_.GetValue() != nullptr); + return index_block_.GetOwnValue() + ? index_block_.GetValue()->ApproximateMemoryUsage() + : 0; + } + + private: + const BlockBasedTable* table_; + CachableEntry index_block_; +}; + +Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) { + PERF_TIMER_GUARD(read_index_block_nanos); + + assert(table != nullptr); + assert(index_block != nullptr); + assert(index_block->IsEmpty()); + + const Rep* const rep = table->get_rep(); + assert(rep != nullptr); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->footer.index_handle(), + UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, + get_context, lookup_context, /* for_compaction */ false, use_cache); + + return s; +} + +Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* index_block) const { + assert(index_block != nullptr); + + if (!index_block_.IsEmpty()) { + index_block->SetUnownedValue(index_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadIndexBlock(table_, /*prefetch_buffer=*/nullptr, read_options, + cache_index_blocks(), get_context, lookup_context, + index_block); +} + +// Index that allows binary search lookup in a two-level index structure. 
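// --- Illustrative aside (editorial, not part of the patch) -----------------
// Rough shape of the two-level lookup implemented by PartitionIndexReader
// below: the top-level index block maps a key to the handle of an index
// partition, and that partition (itself an index block) maps the key to a
// data-block handle. The iterator names are hypothetical placeholders for the
// iterators NewIterator() wires together via NewTwoLevelIterator():
//
//   top_level_iter->Seek(target);    // yields the partition's BlockHandle
//   partition_iter->Seek(target);    // within the partition
//   BlockHandle data_handle = partition_iter->value().handle;
// ----------------------------------------------------------------------------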
+class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read the partition index from the file and create an instance for + // `PartitionIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new PartitionIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + // return a two-level iterator: first level is on the partition index + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + InternalIteratorBase* it = nullptr; + + Statistics* kNullStats = nullptr; + // Filters are already checked before seeking the index + if (!partition_map_.empty()) { + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = NewTwoLevelIterator( + new BlockBasedTable::PartitionedIndexIteratorState(table(), + &partition_map_), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full())); + } else { + ReadOptions ro; + ro.fill_cache = read_options.fill_cache; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + it = new BlockBasedTableIterator( + table(), ro, *internal_comparator(), + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), + nullptr, kNullStats, true, index_has_first_key(), + index_key_includes_seq(), index_value_is_full()), + false, true, /* prefix_extractor */ nullptr, BlockType::kIndex, + lookup_context ? lookup_context->caller + : TableReaderCaller::kUncategorized); + } + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + + // TODO(myabandeh): Update TwoLevelIterator to be able to make use of + // on-stack BlockIter while the state is on heap. Currentlly it assumes + // the first level iter is always on heap and will attempt to delete it + // in its destructor. 
+ } + + void CacheDependencies(bool pin) override { + // Before read partitions, prefetch them to avoid lots of IOs + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + const BlockBasedTable::Rep* rep = table()->rep_; + IndexBlockIter biter; + BlockHandle handle; + Statistics* kNullStats = nullptr; + + CachableEntry index_block; + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &index_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level index block while trying to " + "cache index partitions: %s", + s.ToString().c_str()); + return; + } + + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), &biter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. + // Read the first block offset + biter.SeekToFirst(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + if (!biter.Valid()) { + // Empty index. + return; + } + handle = biter.value().handle; + uint64_t last_off = handle.offset() + block_size(handle); + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + auto& file = rep->file; + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); + + // After prefetch, read the partitions one by one + biter.SeekToFirst(); + auto ro = ReadOptions(); + for (; biter.Valid(); biter.Next()) { + handle = biter.value().handle; + CachableEntry block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, + /*contents=*/nullptr); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + partition_map_[handle.offset()] = std::move(block); + } + } + } + } + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; + } + + private: + PartitionIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unordered_map> partition_map_; +}; + +// Index that allows binary search lookup for the first key of each block. +// This class can be viewed as a thin wrapper for `Block` class which already +// supports binary search. +class BinarySearchIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + // Read index from the file and create an intance for + // `BinarySearchIndexReader`. + // On success, index_reader will be populated; otherwise it will remain + // unmodified. 
+ static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(index_reader != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + index_reader->reset( + new BinarySearchIndexReader(table, std::move(index_block))); + + return Status::OK(); + } + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool /* disable_prefix_seek */, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, true, index_has_first_key(), index_key_includes_seq(), + index_value_is_full()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + BinarySearchIndexReader(const BlockBasedTable* t, + CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} +}; + +// Index that leverages an internal hash table to quicken the lookup for a given +// key. +class HashIndexReader : public BlockBasedTable::IndexReaderCommon { + public: + static Status Create(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_index_iter, bool use_cache, + bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + assert(table != nullptr); + assert(index_reader != nullptr); + assert(!pin || prefetch); + + const BlockBasedTable::Rep* rep = table->get_rep(); + assert(rep != nullptr); + + CachableEntry index_block; + if (prefetch || !use_cache) { + const Status s = + ReadIndexBlock(table, prefetch_buffer, ReadOptions(), use_cache, + /*get_context=*/nullptr, lookup_context, &index_block); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + index_block.Reset(); + } + } + + // Note, failure to create prefix hash index does not need to be a + // hard error. We can still fall back to the original binary search index. + // So, Create will succeed regardless, from this point on. 
+ + index_reader->reset(new HashIndexReader(table, std::move(index_block))); + + // Get prefixes block + BlockHandle prefixes_handle; + Status s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, + &prefixes_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + // Get index metadata block + BlockHandle prefixes_meta_handle; + s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, + &prefixes_meta_handle); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + RandomAccessFileReader* const file = rep->file.get(); + const Footer& footer = rep->footer; + const ImmutableCFOptions& ioptions = rep->ioptions; + const PersistentCacheOptions& cache_options = rep->persistent_cache_options; + MemoryAllocator* const memory_allocator = + GetMemoryAllocator(rep->table_options); + + // Read contents for the blocks + BlockContents prefixes_contents; + BlockFetcher prefixes_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, + &prefixes_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + return s; + } + BlockContents prefixes_meta_contents; + BlockFetcher prefixes_meta_block_fetcher( + file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + &prefixes_meta_contents, ioptions, true /*decompress*/, + true /*maybe_compressed*/, BlockType::kHashIndexMetadata, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = prefixes_meta_block_fetcher.ReadBlockContents(); + if (!s.ok()) { + // TODO: log error + return Status::OK(); + } + + BlockPrefixIndex* prefix_index = nullptr; + s = BlockPrefixIndex::Create(rep->internal_prefix_transform.get(), + prefixes_contents.data, + prefixes_meta_contents.data, &prefix_index); + // TODO: log error + if (s.ok()) { + HashIndexReader* const hash_index_reader = + static_cast(index_reader->get()); + hash_index_reader->prefix_index_.reset(prefix_index); + } + + return Status::OK(); + } + + InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override { + const bool no_io = (read_options.read_tier == kBlockCacheTier); + CachableEntry index_block; + const Status s = + GetOrReadIndexBlock(no_io, get_context, lookup_context, &index_block); + if (!s.ok()) { + if (iter != nullptr) { + iter->Invalidate(s); + return iter; + } + + return NewErrorInternalIterator(s); + } + + Statistics* kNullStats = nullptr; + const bool total_order_seek = + read_options.total_order_seek || disable_prefix_seek; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ auto it = index_block.GetValue()->NewIndexIterator( + internal_comparator(), internal_comparator()->user_comparator(), iter, + kNullStats, total_order_seek, index_has_first_key(), + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, prefix_index_.get()); + + assert(it != nullptr); + index_block.TransferTo(it); + + return it; + } + + size_t ApproximateMemoryUsage() const override { + size_t usage = ApproximateIndexBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + } + + private: + HashIndexReader(const BlockBasedTable* t, CachableEntry&& index_block) + : IndexReaderCommon(t, std::move(index_block)) {} + + std::unique_ptr prefix_index_; +}; + +void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + PERF_COUNTER_ADD(block_cache_hit_count, 1); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_hit; + get_context->get_context_stats_.num_cache_bytes_read += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_HIT); + RecordTick(statistics, BLOCK_CACHE_BYTES_READ, usage); + } + + switch (block_type) { + case BlockType::kFilter: + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_HIT); + } + break; + + case BlockType::kCompressionDictionary: + // TODO: introduce perf counter for compression dictionary hit count + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_HIT); + } + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(block_cache_index_hit_count, 1); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_HIT); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_hit; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_HIT); + } + break; + } +} + +void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce aggregate (not per-level) block cache miss count + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, + static_cast(rep_->level)); + + if (get_context) { + ++get_context->get_context_stats_.num_cache_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_MISS); + } + + // TODO: introduce perf counters for misses per block type + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_MISS); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_MISS); + } + break; + + case BlockType::kIndex: + if (get_context) { + 
++get_context->get_context_stats_.num_cache_index_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_MISS); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_miss; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_MISS); + } + break; + } +} + +void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, + size_t usage) const { + Statistics* const statistics = rep_->ioptions.statistics; + + // TODO: introduce perf counters for block cache insertions + if (get_context) { + ++get_context->get_context_stats_.num_cache_add; + get_context->get_context_stats_.num_cache_bytes_write += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + } + + switch (block_type) { + case BlockType::kFilter: + if (get_context) { + ++get_context->get_context_stats_.num_cache_filter_add; + get_context->get_context_stats_.num_cache_filter_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } + break; + + case BlockType::kCompressionDictionary: + if (get_context) { + ++get_context->get_context_stats_.num_cache_compression_dict_add; + get_context->get_context_stats_ + .num_cache_compression_dict_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); + RecordTick(statistics, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + usage); + } + break; + + case BlockType::kIndex: + if (get_context) { + ++get_context->get_context_stats_.num_cache_index_add; + get_context->get_context_stats_.num_cache_index_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usage); + } + break; + + default: + // TODO: introduce dedicated tickers/statistics/counters + // for range tombstones + if (get_context) { + ++get_context->get_context_stats_.num_cache_data_add; + get_context->get_context_stats_.num_cache_data_bytes_insert += usage; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, usage); + } + break; + } +} + +Cache::Handle* BlockBasedTable::GetEntryFromCache( + Cache* block_cache, const Slice& key, BlockType block_type, + GetContext* get_context) const { + auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + + if (cache_handle != nullptr) { + UpdateCacheHitMetrics(block_type, get_context, + block_cache->GetUsage(cache_handle)); + } else { + UpdateCacheMissMetrics(block_type, get_context); + } + + return cache_handle; +} + +// Helper function to setup the cache key's prefix for the Table. 
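// --- Illustrative aside (editorial, not part of the patch) -----------------
// The prefixes set up below combine with a block's offset to form the final
// block cache key: GetCacheKey() further down copies the per-file prefix and
// then appends the block handle's offset as a varint64. A minimal sketch of
// that layout, using only names that appear in the surrounding code:
//
//   char buf[kMaxCacheKeyPrefixSize + 10 /* max varint64 length */];
//   memcpy(buf, rep->cache_key_prefix, rep->cache_key_prefix_size);
//   char* end =
//       EncodeVarint64(buf + rep->cache_key_prefix_size, handle.offset());
//   Slice cache_key(buf, static_cast<size_t>(end - buf));
// ----------------------------------------------------------------------------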
+void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { + assert(kMaxCacheKeyPrefixSize >= 10); + rep->cache_key_prefix_size = 0; + rep->compressed_cache_key_prefix_size = 0; + if (rep->table_options.block_cache != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + } + if (rep->table_options.persistent_cache != nullptr) { + GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), + &rep->persistent_cache_key_prefix[0], + &rep->persistent_cache_key_prefix_size); + } + if (rep->table_options.block_cache_compressed != nullptr) { + GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), + rep->file->file(), &rep->compressed_cache_key_prefix[0], + &rep->compressed_cache_key_prefix_size); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file, + char* buffer, size_t* size) { + // generate an id from the file + *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); + + // If the prefix wasn't generated or was too long, + // create one from the cache. + if (cc != nullptr && *size == 0) { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } +} + +namespace { +// Return True if table_properties has `user_prop_name` has a `true` value +// or it doesn't contain this property (for backward compatible). +bool IsFeatureSupported(const TableProperties& table_properties, + const std::string& user_prop_name, Logger* info_log) { + auto& props = table_properties.user_collected_properties; + auto pos = props.find(user_prop_name); + // Older version doesn't have this value set. Skip this check. + if (pos != props.end()) { + if (pos->second == kPropFalse) { + return false; + } else if (pos->second != kPropTrue) { + ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s", + user_prop_name.c_str(), pos->second.c_str()); + } + } + return true; +} + +// Caller has to ensure seqno is not nullptr. +Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + + *seqno = kDisableGlobalSequenceNumber; + if (version_pos == props.end()) { + if (seqno_pos != props.end()) { + std::array msg_buf; + // This is not an external sst file, global_seqno is not supported. + snprintf( + msg_buf.data(), msg_buf.max_size(), + "A non-external sst file have global seqno property with value %s", + seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + uint32_t version = DecodeFixed32(version_pos->second.c_str()); + if (version < 2) { + if (seqno_pos != props.end() || version != 1) { + std::array msg_buf; + // This is a v1 external sst file, global_seqno is not supported. 
+ snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); + } + return Status::OK(); + } + + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno + // to denote it is unknown. + if (largest_seqno < kMaxSequenceNumber) { + if (global_seqno == 0) { + global_seqno = largest_seqno; + } + if (global_seqno != largest_seqno) { + std::array msg_buf; + snprintf( + msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + } + *seqno = global_seqno; + + if (global_seqno > kMaxSequenceNumber) { + std::array msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast(global_seqno)); + return Status::Corruption(msg_buf.data()); + } + + return Status::OK(); +} +} // namespace + +Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, + size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + assert(cache_key != nullptr); + assert(cache_key_prefix_size != 0); + assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); + memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); + char* end = + EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + return Slice(cache_key, static_cast(end - cache_key)); +} + +Status BlockBasedTable::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, + const SliceTransform* prefix_extractor, + const bool prefetch_index_and_filter_in_cache, const bool skip_filters, + const int level, const bool immortal_table, + const SequenceNumber largest_seqno, TailPrefetchStats* tail_prefetch_stats, + BlockCacheTracer* const block_cache_tracer) { + table_reader->reset(); + + Status s; + Footer footer; + std::unique_ptr prefetch_buffer; + + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, + preload_all, &prefetch_buffer); + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] + s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + if (!s.ok()) { + return s; + } + if (!BlockBasedTableSupportedVersion(footer.version())) { + return Status::Corruption( + "Unknown Footer version. 
Maybe this file was created with newer " + "version of RocksDB?"); + } + + // We've successfully read the footer. We are ready to serve requests. + // Better not mutate rep_ after the creation. eg. internal_prefix_transform + // raw pointer will be used to create HashIndexReader, whose reset may + // access a dangling pointer. + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, + internal_comparator, skip_filters, level, + immortal_table); + rep->file = std::move(file); + rep->footer = footer; + rep->hash_index_allow_collision = table_options.hash_index_allow_collision; + // We need to wrap data with internal_prefix_transform to make sure it can + // handle prefix correctly. + rep->internal_prefix_transform.reset( + new InternalKeySliceTransform(prefix_extractor)); + SetupCacheKeyPrefix(rep); + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); + + // page cache options + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + std::string(rep->persistent_cache_key_prefix, + rep->persistent_cache_key_prefix_size), + rep->ioptions.statistics); + + // Meta-blocks are not dictionary compressed. Explicitly set the dictionary + // handle to null, otherwise it may be seen as uninitialized during the below + // meta-block reads. + rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + + // Read metaindex + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + s = new_table->ReadMetaIndexBlock(prefetch_buffer.get(), &metaindex, + &metaindex_iter); + if (!s.ok()) { + return s; + } + + // Populates table_properties and some fields that depend on it, + // such as index_type. + s = new_table->ReadPropertiesBlock(prefetch_buffer.get(), + metaindex_iter.get(), largest_seqno); + if (!s.ok()) { + return s; + } + s = new_table->ReadRangeDelBlock(prefetch_buffer.get(), metaindex_iter.get(), + internal_comparator, &lookup_context); + if (!s.ok()) { + return s; + } + s = new_table->PrefetchIndexAndFilterBlocks( + prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), + prefetch_all, table_options, level, &lookup_context); + + if (s.ok()) { + // Update tail prefetch stats + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast(file_size) - prefetch_buffer->min_offset_read()); + } + + *table_reader = std::move(new_table); + } + + return s; +} + +Status BlockBasedTable::PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr* prefetch_buffer) { + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. 
+ tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast(file_size); + } else { + prefetch_off = static_cast(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + Status s; + // TODO should not have this special logic in the future. + if (!file->use_direct_io()) { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); + s = file->Prefetch(prefetch_off, prefetch_len); + } else { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); + s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); + } + return s; +} + +Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, + uint32_t expected) { + Status s; + uint32_t actual = 0; + switch (type) { + case kNoChecksum: + break; + case kCRC32c: + expected = crc32c::Unmask(expected); + actual = crc32c::Value(buf, len); + break; + case kxxHash: + actual = XXH32(buf, static_cast(len), 0); + break; + case kxxHash64: + actual = static_cast(XXH64(buf, static_cast(len), 0) & + uint64_t{0xffffffff}); + break; + default: + s = Status::Corruption("unknown checksum type"); + } + if (s.ok() && actual != expected) { + s = Status::Corruption("properties block checksum mismatched"); + } + return s; +} + +Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( + FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties) { + assert(table_properties != nullptr); + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. In this case, we read the properties + // block, copy it to a memory buffer, change the global seqno to its + // original value, i.e. 0, and verify the checksum again. 
+ BlockHandle props_block_handle; + CacheAllocationPtr tmp_buf; + Status s = ReadProperties(handle_value, rep_->file.get(), prefetch_buffer, + rep_->footer, rep_->ioptions, table_properties, + false /* verify_checksum */, &props_block_handle, + &tmp_buf, false /* compression_type_missing */, + nullptr /* memory_allocator */); + if (s.ok() && tmp_buf) { + const auto seqno_pos_iter = + (*table_properties) + ->properties_offsets.find( + ExternalSstFilePropertyNames::kGlobalSeqno); + size_t block_size = static_cast(props_block_handle.size()); + if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { + uint64_t global_seqno_offset = seqno_pos_iter->second; + EncodeFixed64( + tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); + } + uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); + s = rocksdb::VerifyChecksum(rep_->footer.checksum(), tmp_buf.get(), + block_size + 1, value); + } + return s; +} + +Status BlockBasedTable::ReadPropertiesBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const SequenceNumber largest_seqno) { + bool found_properties_block = true; + Status s; + s = SeekToPropertiesBlock(meta_iter, &found_properties_block); + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Error when seeking to properties block from file: %s", + s.ToString().c_str()); + } else if (found_properties_block) { + s = meta_iter->status(); + TableProperties* table_properties = nullptr; + if (s.ok()) { + s = ReadProperties( + meta_iter->value(), rep_->file.get(), prefetch_buffer, rep_->footer, + rep_->ioptions, &table_properties, true /* verify_checksum */, + nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, + false /* compression_type_missing */, nullptr /* memory_allocator */); + } + + if (s.IsCorruption()) { + s = TryReadPropertiesWithGlobalSeqno(prefetch_buffer, meta_iter->value(), + &table_properties); + } + std::unique_ptr props_guard; + if (table_properties != nullptr) { + props_guard.reset(table_properties); + } + + if (!s.ok()) { + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Encountered error while reading data from properties " + "block %s", + s.ToString().c_str()); + } else { + assert(table_properties != nullptr); + rep_->table_properties.reset(props_guard.release()); + rep_->blocks_maybe_compressed = + rep_->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep_->blocks_definitely_zstd_compressed = + (rep_->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep_->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + } + } else { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Cannot find Properties block from file."); + } +#ifndef ROCKSDB_LITE + if (rep_->table_properties) { + ParseSliceTransform(rep_->table_properties->prefix_extractor_name, + &(rep_->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE + + // Read the table properties, if provided. 
+ if (rep_->table_properties) { + rep_->whole_key_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep_->ioptions.info_log); + rep_->prefix_filtering &= + IsFeatureSupported(*(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, + rep_->ioptions.info_log); + + rep_->index_key_includes_seq = + rep_->table_properties->index_key_is_user_key == 0; + rep_->index_value_is_full = + rep_->table_properties->index_value_is_delta_encoded == 0; + + // Update index_type with the true type. + // If table properties don't contain index type, we assume that the table + // is in very old format and has kBinarySearch index type. + auto& props = rep_->table_properties->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + rep_->index_type = static_cast( + DecodeFixed32(pos->second.c_str())); + } + + rep_->index_has_first_key = + rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey; + + s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, + &(rep_->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + } + } + return s; +} + +Status BlockBasedTable::ReadRangeDelBlock( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context) { + Status s; + bool found_range_del_block; + BlockHandle range_del_handle; + s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Error when seeking to range delete tombstones block from file: %s", + s.ToString().c_str()); + } else if (found_range_del_block && !range_del_handle.IsNull()) { + ReadOptions read_options; + std::unique_ptr iter(NewDataBlockIterator( + read_options, range_del_handle, + /*input_iter=*/nullptr, BlockType::kRangeDeletion, + /*get_context=*/nullptr, lookup_context, Status(), prefetch_buffer)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep_->ioptions.info_log, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + } else { + rep_->fragmented_range_dels = + std::make_shared(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + BlockCacheLookupContext* lookup_context) { + Status s; + + // Find filter handle and filter type + if (rep_->filter_policy) { + for (auto filter_type : + {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, + Rep::FilterType::kBlockFilter}) { + std::string prefix; + switch (filter_type) { + case Rep::FilterType::kFullFilter: + prefix = kFullFilterBlockPrefix; + break; + case Rep::FilterType::kPartitionedFilter: + prefix = kPartitionedFilterBlockPrefix; + break; + case Rep::FilterType::kBlockFilter: + prefix = kFilterBlockPrefix; + break; + default: + assert(0); + } + std::string filter_block_key = prefix; + filter_block_key.append(rep_->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle) + .ok()) { + rep_->filter_type = filter_type; + break; + } + } + } + + // Find compression dictionary handle + bool found_compression_dict = 
false;
+  s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
+                                 &rep_->compression_dict_handle);
+  if (!s.ok()) {
+    return s;
+  }
+
+  BlockBasedTableOptions::IndexType index_type = rep_->index_type;
+
+  const bool use_cache = table_options.cache_index_and_filter_blocks;
+
+  // pin both index and filters, down to all partitions
+  const bool pin_all =
+      rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
+
+  // prefetch the first level of index
+  const bool prefetch_index =
+      prefetch_all ||
+      (table_options.pin_top_level_index_and_filter &&
+       index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+  // pin the first level of index
+  const bool pin_index =
+      pin_all || (table_options.pin_top_level_index_and_filter &&
+                  index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+
+  std::unique_ptr<IndexReader> index_reader;
+  s = new_table->CreateIndexReader(prefetch_buffer, meta_iter, use_cache,
+                                   prefetch_index, pin_index, lookup_context,
+                                   &index_reader);
+  if (!s.ok()) {
+    return s;
+  }
+
+  rep_->index_reader = std::move(index_reader);
+
+  // The partitions of a partitioned index are always stored in cache. They
+  // hence follow the configuration for pin and prefetch regardless of
+  // the value of cache_index_and_filter_blocks.
+  if (prefetch_all) {
+    rep_->index_reader->CacheDependencies(pin_all);
+  }
+
+  // prefetch the first level of filter
+  const bool prefetch_filter =
+      prefetch_all ||
+      (table_options.pin_top_level_index_and_filter &&
+       rep_->filter_type == Rep::FilterType::kPartitionedFilter);
+  // Partition filters cannot be enabled without partition indexes
+  assert(!prefetch_filter || prefetch_index);
+  // pin the first level of filter
+  const bool pin_filter =
+      pin_all || (table_options.pin_top_level_index_and_filter &&
+                  rep_->filter_type == Rep::FilterType::kPartitionedFilter);
+
+  if (rep_->filter_policy) {
+    auto filter = new_table->CreateFilterBlockReader(
+        prefetch_buffer, use_cache, prefetch_filter, pin_filter,
+        lookup_context);
+    if (filter) {
+      // Refer to the comment above about partitioned indexes always being
+      // cached
+      if (prefetch_all) {
+        filter->CacheDependencies(pin_all);
+      }
+
+      rep_->filter = std::move(filter);
+    }
+  }
+
+  if (!rep_->compression_dict_handle.IsNull()) {
+    std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
+    s = UncompressionDictReader::Create(this, prefetch_buffer, use_cache,
+                                        prefetch_all, pin_all, lookup_context,
+                                        &uncompression_dict_reader);
+    if (!s.ok()) {
+      return s;
+    }
+
+    rep_->uncompression_dict_reader = std::move(uncompression_dict_reader);
+  }
+
+  assert(s.ok());
+  return s;
+}
+
+void BlockBasedTable::SetupForCompaction() {
+  switch (rep_->ioptions.access_hint_on_compaction_start) {
+    case Options::NONE:
+      break;
+    case Options::NORMAL:
+      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
+      break;
+    case Options::SEQUENTIAL:
+      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
+      break;
+    case Options::WILLNEED:
+      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
+      break;
+    default:
+      assert(false);
+  }
+}
+
+std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
+    const {
+  return rep_->table_properties;
+}
+
+size_t BlockBasedTable::ApproximateMemoryUsage() const {
+  size_t usage = 0;
+  if (rep_->filter) {
+    usage += rep_->filter->ApproximateMemoryUsage();
+  }
+  if (rep_->index_reader) {
+    usage += rep_->index_reader->ApproximateMemoryUsage();
+  }
+  if (rep_->uncompression_dict_reader) {
+    usage += rep_->uncompression_dict_reader->ApproximateMemoryUsage();
+  }
+  return usage;
+}
+
+// Load
the meta-index-block from the file. On success, return the loaded +// metaindex +// block and its iterator. +Status BlockBasedTable::ReadMetaIndexBlock( + FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* metaindex_block, + std::unique_ptr* iter) { + // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates + // it is an empty block. + std::unique_ptr metaindex; + Status s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ReadOptions(), + rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, + true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, + kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, + GetMemoryAllocator(rep_->table_options), false /* for_compaction */, + rep_->blocks_definitely_zstd_compressed, nullptr /* filter_policy */); + + if (!s.ok()) { + ROCKS_LOG_ERROR(rep_->ioptions.info_log, + "Encountered error while reading data from properties" + " block %s", + s.ToString().c_str()); + return s; + } + + *metaindex_block = std::move(metaindex); + // meta block uses bytewise comparator. + iter->reset(metaindex_block->get()->NewDataIterator(BytewiseComparator(), + BytewiseComparator())); + return Status::OK(); +} + +template +Status BlockBasedTable::GetDataBlockFromCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, CachableEntry* block, + const UncompressionDict& uncompression_dict, BlockType block_type, + GetContext* get_context) const { + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + assert(block); + assert(block->IsEmpty()); + + Status s; + BlockContents* compressed_block = nullptr; + Cache::Handle* block_cache_compressed_handle = nullptr; + + // Lookup uncompressed cache first + if (block_cache != nullptr) { + auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, + block_type, get_context); + if (cache_handle != nullptr) { + block->SetCachedValue( + reinterpret_cast(block_cache->Value(cache_handle)), + block_cache, cache_handle); + return s; + } + } + + // If not found, search from the compressed block cache. 
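+  // A hit in the compressed block cache is uncompressed below and, when an
+  // uncompressed block cache is configured and fill_cache is set, re-inserted
+  // there so later reads can skip decompression.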
+ assert(block->IsEmpty()); + + if (block_cache_compressed == nullptr) { + return s; + } + + assert(!compressed_block_cache_key.empty()); + block_cache_compressed_handle = + block_cache_compressed->Lookup(compressed_block_cache_key); + + Statistics* statistics = rep_->ioptions.statistics; + + // if we found in the compressed cache, then uncompress and insert into + // uncompressed cache + if (block_cache_compressed_handle == nullptr) { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); + return s; + } + + // found compressed block + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); + compressed_block = reinterpret_cast( + block_cache_compressed->Value(block_cache_compressed_handle)); + CompressionType compression_type = compressed_block->get_compression_type(); + assert(compression_type != kNoCompression); + + // Retrieve the uncompressed contents into a new buffer + BlockContents contents; + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents( + info, compressed_block->data.data(), compressed_block->data.size(), + &contents, rep_->table_options.format_version, rep_->ioptions, + GetMemoryAllocator(rep_->table_options)); + + // Insert uncompressed block into block cache + if (s.ok()) { + std::unique_ptr block_holder( + BlocklikeTraits::Create( + std::move(contents), rep_->get_global_seqno(block_type), + read_amp_bytes_per_bit, statistics, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); // uncompressed block + + if (block_cache != nullptr && block_holder->own_bytes() && + read_options.fill_cache) { + size_t charge = block_holder->ApproximateMemoryUsage(); + Cache::Handle* cache_handle = nullptr; + s = block_cache->Insert(block_cache_key, block_holder.get(), charge, + &DeleteCachedEntry, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + block->SetCachedValue(block_holder.release(), block_cache, + cache_handle); + + UpdateCacheInsertionMetrics(block_type, get_context, charge); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + } + } else { + block->SetOwnedValue(block_holder.release()); + } + } + + // Release hold on compressed cache entry + block_cache_compressed->Release(block_cache_compressed_handle); + return s; +} + +template +Status BlockBasedTable::PutDataBlockToCache( + const Slice& block_cache_key, const Slice& compressed_block_cache_key, + Cache* block_cache, Cache* block_cache_compressed, + CachableEntry* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const { + const ImmutableCFOptions& ioptions = rep_->ioptions; + const uint32_t format_version = rep_->table_options.format_version; + const size_t read_amp_bytes_per_bit = + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0; + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW; + assert(cached_block); + assert(cached_block->IsEmpty()); + + Status s; + Statistics* statistics = ioptions.statistics; + + std::unique_ptr block_holder; + if (raw_block_comp_type != kNoCompression) { + // Retrieve the uncompressed contents into a new buffer + BlockContents uncompressed_block_contents; + UncompressionContext context(raw_block_comp_type); + UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); + s = UncompressBlockContents(info, raw_block_contents->data.data(), + raw_block_contents->data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); + if (!s.ok()) { + return s; + } + + block_holder.reset(BlocklikeTraits::Create( + std::move(uncompressed_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } else { + block_holder.reset(BlocklikeTraits::Create( + std::move(*raw_block_contents), seq_no, read_amp_bytes_per_bit, + statistics, rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get())); + } + + // Insert compressed block into compressed block cache. + // Release the hold on the compressed cache entry immediately. + if (block_cache_compressed != nullptr && + raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && + raw_block_contents->own_bytes()) { +#ifndef NDEBUG + assert(raw_block_contents->is_raw_block); +#endif // NDEBUG + + // We cannot directly put raw_block_contents because this could point to + // an object in the stack. + BlockContents* block_cont_for_comp_cache = + new BlockContents(std::move(*raw_block_contents)); + s = block_cache_compressed->Insert( + compressed_block_cache_key, block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), + &DeleteCachedEntry); + if (s.ok()) { + // Avoid the following code to delete this cached block. 
+      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD);
+    } else {
+      RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+      delete block_cont_for_comp_cache;
+    }
+  }
+
+  // insert into uncompressed block cache
+  if (block_cache != nullptr && block_holder->own_bytes()) {
+    size_t charge = block_holder->ApproximateMemoryUsage();
+    Cache::Handle* cache_handle = nullptr;
+    s = block_cache->Insert(block_cache_key, block_holder.get(), charge,
+                            &DeleteCachedEntry<TBlocklike>, &cache_handle,
+                            priority);
+    if (s.ok()) {
+      assert(cache_handle != nullptr);
+      cached_block->SetCachedValue(block_holder.release(), block_cache,
+                                   cache_handle);
+
+      UpdateCacheInsertionMetrics(block_type, get_context, charge);
+    } else {
+      RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
+    }
+  } else {
+    cached_block->SetOwnedValue(block_holder.release());
+  }
+
+  return s;
+}
+
+std::unique_ptr<FilterBlockReader> BlockBasedTable::CreateFilterBlockReader(
+    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
+    bool pin, BlockCacheLookupContext* lookup_context) {
+  auto& rep = rep_;
+  auto filter_type = rep->filter_type;
+  if (filter_type == Rep::FilterType::kNoFilter) {
+    return std::unique_ptr<FilterBlockReader>();
+  }
+
+  assert(rep->filter_policy);
+
+  switch (filter_type) {
+    case Rep::FilterType::kPartitionedFilter:
+      return PartitionedFilterBlockReader::Create(
+          this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
+
+    case Rep::FilterType::kBlockFilter:
+      return BlockBasedFilterBlockReader::Create(
+          this, prefetch_buffer, use_cache, prefetch, pin, lookup_context);
+
+    case Rep::FilterType::kFullFilter:
+      return FullFilterBlockReader::Create(this, prefetch_buffer, use_cache,
+                                           prefetch, pin, lookup_context);
+
+    default:
+      // filter_type is either kNoFilter (exited the function at the first if),
+      // or it must be covered in this switch block
+      assert(false);
+      return std::unique_ptr<FilterBlockReader>();
+  }
+}
+
+// disable_prefix_seek should be set to true when prefix_extractor found in SST
+// differs from the one in mutable_cf_options and index type is HashBasedIndex
+InternalIteratorBase<IndexValue>* BlockBasedTable::NewIndexIterator(
+    const ReadOptions& read_options, bool disable_prefix_seek,
+    IndexBlockIter* input_iter, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context) const {
+  assert(rep_ != nullptr);
+  assert(rep_->index_reader != nullptr);
+
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  return rep_->index_reader->NewIterator(read_options, disable_prefix_seek,
+                                         input_iter, get_context,
+                                         lookup_context);
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, allocate a new iterator.
+// If input_iter is not null, update it in place and return it.
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(
+    const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
+    BlockType block_type, GetContext* get_context,
+    BlockCacheLookupContext* lookup_context, Status s,
+    FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+  TBlockIter* iter = input_iter != nullptr ?
input_iter : new TBlockIter; + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + + CachableEntry uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + prefetch_buffer, no_io, get_context, lookup_context, + &uncompression_dict); + if (!s.ok()) { + iter->Invalidate(s); + return iter; + } + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + CachableEntry block; + s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, + get_context, lookup_context, for_compaction, + /* use_cache */ true); + + if (!s.ok()) { + assert(block.IsEmpty()); + iter->Invalidate(s); + return iter; + } + + assert(block.GetValue() != nullptr); + + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate
+      // from SST cache key (31 bytes), and use a non-zero prefix to
+      // differentiate from `write_buffer_manager`.
+      const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1;
+      char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length];
+      // Prefix: use rep_->cache_key_prefix padded by 0s
+      memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length);
+      assert(rep_->cache_key_prefix_size != 0);
+      assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix);
+      memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size);
+      char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix,
+                                 next_cache_key_id_++);
+      assert(end - cache_key <=
+             static_cast<int>(kExtraCacheKeyPrefix + kMaxVarint64Length));
+      const Slice unique_key(cache_key, static_cast<size_t>(end - cache_key));
+      s = block_cache->Insert(unique_key, nullptr,
+                              block.GetValue()->ApproximateMemoryUsage(),
+                              nullptr, &cache_handle);
+
+      if (s.ok()) {
+        assert(cache_handle != nullptr);
+        iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
+                              cache_handle);
+      }
+    }
+  } else {
+    iter->SetCacheHandle(block.GetCacheHandle());
+  }
+
+  block.TransferTo(iter);
+
+  return iter;
+}
+
+template <>
+DataBlockIter* BlockBasedTable::InitBlockIterator<DataBlockIter>(
+    const Rep* rep, Block* block, DataBlockIter* input_iter,
+    bool block_contents_pinned) {
+  return block->NewDataIterator(
+      &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+      input_iter, rep->ioptions.statistics, block_contents_pinned);
+}
+
+template <>
+IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
+    const Rep* rep, Block* block, IndexBlockIter* input_iter,
+    bool block_contents_pinned) {
+  return block->NewIndexIterator(
+      &rep->internal_comparator, rep->internal_comparator.user_comparator(),
+      input_iter, rep->ioptions.statistics, /* total_order_seek */ true,
+      rep->index_has_first_key, rep->index_key_includes_seq,
+      rep->index_value_is_full, block_contents_pinned);
+}
+
+// Convert an uncompressed data block (i.e., CachableEntry<Block>)
+// into an iterator over the contents of the corresponding block.
+// If input_iter is null, allocate a new iterator.
+// If input_iter is not null, update it in place and return it.
+template <typename TBlockIter>
+TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
+                                                  CachableEntry<Block>& block,
+                                                  TBlockIter* input_iter,
+                                                  Status s) const {
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
+  TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
+  if (!s.ok()) {
+    iter->Invalidate(s);
+    return iter;
+  }
+
+  assert(block.GetValue() != nullptr);
+  // Block contents are pinned and it is still pinned after the iterator
+  // is destroyed as long as cleanup functions are moved to another object,
+  // when:
+  // 1. block cache handle is set to be released in cleanup function, or
+  // 2. it's pointing to immortal source. If own_bytes is true then we are
+  //    not reading data from the original source, whether immortal or not.
+  //    Otherwise, the block is pinned iff the source is immortal.
+ const bool block_contents_pinned = + block.IsCached() || + (!block.GetValue()->own_bytes() && rep_->immortal_table); + iter = InitBlockIterator(rep_, block.GetValue(), iter, + block_contents_pinned); + + if (!block.IsCached()) { + if (!ro.fill_cache && rep_->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache* const block_cache = rep_->table_options.block_cache.get(); + Cache::Handle* cache_handle = nullptr; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep_->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep_->cache_key_prefix_size != 0); + assert(rep_->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep_->cache_key_prefix, rep_->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); + const Slice unique_key(cache_key, static_cast(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.GetValue()->ApproximateMemoryUsage(), + nullptr, &cache_handle); + if (s.ok()) { + assert(cache_handle != nullptr); + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } else { + iter->SetCacheHandle(block.GetCacheHandle()); + } + + block.TransferTo(iter); + return iter; +} + +// If contents is nullptr, this function looks up the block caches for the +// data block referenced by handle, and read the block from disk if necessary. +// If contents is non-null, it skips the cache lookup and disk read, since +// the caller has already read it. In both cases, if ro.fill_cache is true, +// it inserts the block into the block cache. +template +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const { + assert(block_entry != nullptr); + const bool no_io = (ro.read_tier == kBlockCacheTier); + Cache* block_cache = rep_->table_options.block_cache.get(); + // No point to cache compressed blocks if it never goes away + Cache* block_cache_compressed = + rep_->immortal_table ? nullptr + : rep_->table_options.block_cache_compressed.get(); + + // First, try to get the block from the cache + // + // If either block cache is enabled, we'll try to read from it. 
+ Status s; + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key /* key to the block cache */; + Slice ckey /* key to the compressed block cache */; + bool is_cache_hit = false; + if (block_cache != nullptr || block_cache_compressed != nullptr) { + // create key for block cache + if (block_cache != nullptr) { + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + handle, cache_key); + } + + if (block_cache_compressed != nullptr) { + ckey = GetCacheKey(rep_->compressed_cache_key_prefix, + rep_->compressed_cache_key_prefix_size, handle, + compressed_cache_key); + } + + if (!contents) { + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + ro, block_entry, uncompression_dict, block_type, + get_context); + if (block_entry->GetValue()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache and + // compressed block cache. + is_cache_hit = true; + } + } + + // Can't find the block from the cache. If I/O is allowed, read from the + // file. + if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.statistics; + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed && !block_cache_compressed; + CompressionType raw_block_comp_type; + BlockContents raw_block_contents; + if (!contents) { + StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + BlockFetcher block_fetcher( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, + &raw_block_contents, rep_->ioptions, do_uncompress, + maybe_compressed, block_type, uncompression_dict, + rep_->persistent_cache_options, + GetMemoryAllocator(rep_->table_options), + GetMemoryAllocatorForCompressedBlock(rep_->table_options)); + s = block_fetcher.ReadBlockContents(); + raw_block_comp_type = block_fetcher.get_compression_type(); + contents = &raw_block_contents; + } else { + raw_block_comp_type = contents->get_compression_type(); + } + + if (s.ok()) { + SequenceNumber seq_no = rep_->get_global_seqno(block_type); + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. + s = PutDataBlockToCache( + key, ckey, block_cache, block_cache_compressed, block_entry, + contents, raw_block_comp_type, uncompression_dict, seq_no, + GetMemoryAllocator(rep_->table_options), block_type, get_context); + } + } + } + + // Fill lookup_context. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && + lookup_context) { + size_t usage = 0; + uint64_t nkeys = 0; + if (block_entry->GetValue()) { + // Approximate the number of keys in the block using restarts. 
+ nkeys = + rep_->table_options.block_restart_interval * + BlocklikeTraits::GetNumRestarts(*block_entry->GetValue()); + usage = block_entry->GetValue()->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (block_type) { + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kFilter: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + trace_block_type, lookup_context->caller)) { + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., referenced_key_exist_in_block. + + // Make a copy of the block key here since it will be logged later. + lookup_context->FillLookupContext( + is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); + } else { + // Avoid making copy of block_key and cf_name when constructing the access + // record. + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", trace_block_type, + /*block_size=*/usage, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, + no_insert, lookup_context->get_id, + lookup_context->get_from_user_specified_snapshot, + /*referenced_key=*/""); + block_cache_tracer_->WriteBlockAccess(access_record, key, + rep_->cf_name_for_tracing(), + lookup_context->referenced_key); + } + } + + assert(s.ok() || block_entry->GetValue() == nullptr); + return s; +} + +// This function reads multiple data blocks from disk using Env::MultiRead() +// and optionally inserts them into the block cache. It uses the scratch +// buffer provided by the caller, which is contiguous. If scratch is a nullptr +// it allocates a separate buffer for each block. Typically, if the blocks +// need to be uncompressed and there is no compressed block cache, callers +// can allocate a temporary scratch buffer in order to minimize memory +// allocations. +// If options.fill_cache is true, it inserts the blocks into cache. If its +// false and scratch is non-null and the blocks are uncompressed, it copies +// the buffers to heap. In any case, the CachableEntry returned will +// own the data bytes. +// batch - A MultiGetRange with only those keys with unique data blocks not +// found in cache +// handles - A vector of block handles. 
Some of them me be NULL handles +// scratch - An optional contiguous buffer to read compressed blocks into +void BlockBasedTable::RetrieveMultipleBlocks( + const ReadOptions& options, const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector, MultiGetContext::MAX_BATCH_SIZE>* results, + char* scratch, const UncompressionDict& uncompression_dict) const { + RandomAccessFileReader* file = rep_->file.get(); + const Footer& footer = rep_->footer; + const ImmutableCFOptions& ioptions = rep_->ioptions; + SequenceNumber global_seqno = rep_->get_global_seqno(BlockType::kData); + size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; + MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); + + if (file->use_direct_io() || ioptions.allow_mmap_reads) { + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + (*statuses)[idx_in_batch] = + RetrieveBlock(nullptr, options, handle, uncompression_dict, + &(*results)[idx_in_batch], BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + } + return; + } + + autovector read_reqs; + size_t buf_offset = 0; + size_t idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + if (handle.IsNull()) { + continue; + } + + ReadRequest req; + req.len = block_size(handle); + if (scratch == nullptr) { + req.scratch = new char[req.len]; + } else { + req.scratch = scratch + buf_offset; + buf_offset += req.len; + } + req.offset = handle.offset(); + req.status = Status::OK(); + read_reqs.emplace_back(req); + } + + file->MultiRead(&read_reqs[0], read_reqs.size()); + + size_t read_req_idx = 0; + idx_in_batch = 0; + for (auto mget_iter = batch->begin(); mget_iter != batch->end(); + ++mget_iter, ++idx_in_batch) { + const BlockHandle& handle = (*handles)[idx_in_batch]; + + if (handle.IsNull()) { + continue; + } + + ReadRequest& req = read_reqs[read_req_idx++]; + Status s = req.status; + if (s.ok()) { + if (req.result.size() != req.len) { + s = Status::Corruption("truncated block read from " + + rep_->file->file_name() + " offset " + + ToString(handle.offset()) + ", expected " + + ToString(req.len) + + " bytes, got " + ToString(req.result.size())); + } + } + + BlockContents raw_block_contents; + if (s.ok()) { + if (scratch == nullptr) { + // We allocated a buffer for this block. 
Give ownership of it to + // BlockContents so it can free the memory + assert(req.result.data() == req.scratch); + std::unique_ptr raw_block(req.scratch); + raw_block_contents = BlockContents(std::move(raw_block), handle.size()); + } else { + // We used the scratch buffer, so no need to free anything + raw_block_contents = BlockContents(Slice(req.scratch, handle.size())); + } +#ifndef NDEBUG + raw_block_contents.is_raw_block = true; +#endif + if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); + const char* data = req.result.data(); + uint32_t expected = DecodeFixed32(data + handle.size() + 1); + s = rocksdb::VerifyChecksum(footer.checksum(), req.result.data(), + handle.size() + 1, expected); + } + } + if (s.ok()) { + if (options.fill_cache) { + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + CachableEntry* block_entry = &(*results)[idx_in_batch]; + // MaybeReadBlockAndLoadToCache will insert into the block caches if + // necessary. Since we're passing the raw block contents, it will + // avoid looking up the block cache + s = MaybeReadBlockAndLoadToCache( + nullptr, options, handle, uncompression_dict, block_entry, + BlockType::kData, mget_iter->get_context, + &lookup_data_block_context, &raw_block_contents); + + // block_entry value could be null if no block cache is present, i.e + // BlockBasedTableOptions::no_block_cache is true and no compressed + // block cache is configured. In that case, fall + // through and set up the block explicitly + if (block_entry->GetValue() != nullptr) { + continue; + } + } + + CompressionType compression_type = + raw_block_contents.get_compression_type(); + BlockContents contents; + if (compression_type != kNoCompression) { + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, req.result.data(), handle.size(), + &contents, footer.version(), + rep_->ioptions, memory_allocator); + } else { + if (scratch != nullptr) { + // If we used the scratch buffer, then the contents need to be + // copied to heap + Slice raw = Slice(req.result.data(), handle.size()); + contents = BlockContents( + CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), + handle.size()); + } else { + contents = std::move(raw_block_contents); + } + } + if (s.ok()) { + (*results)[idx_in_batch].SetOwnedValue( + new Block(std::move(contents), global_seqno, + read_amp_bytes_per_bit, ioptions.statistics)); + } + } + (*statuses)[idx_in_batch] = s; + } +} + +template +Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const { + assert(block_entry); + assert(block_entry->IsEmpty()); + + Status s; + if (use_cache) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, + uncompression_dict, block_entry, + block_type, get_context, lookup_context, + /*contents=*/nullptr); + + if (!s.ok()) { + return s; + } + + if (block_entry->GetValue() != nullptr) { + assert(s.ok()); + return s; + } + } + + assert(block_entry->IsEmpty()); + + const bool no_io = ro.read_tier == kBlockCacheTier; + if (no_io) { + return Status::Incomplete("no blocking io"); + } + + const bool maybe_compressed = + block_type != BlockType::kFilter && + block_type != 
BlockType::kCompressionDictionary && + rep_->blocks_maybe_compressed; + const bool do_uncompress = maybe_compressed; + std::unique_ptr block; + + { + StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile( + rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, + rep_->ioptions, do_uncompress, maybe_compressed, block_type, + uncompression_dict, rep_->persistent_cache_options, + rep_->get_global_seqno(block_type), + block_type == BlockType::kData + ? rep_->table_options.read_amp_bytes_per_bit + : 0, + GetMemoryAllocator(rep_->table_options), for_compaction, + rep_->blocks_definitely_zstd_compressed, + rep_->table_options.filter_policy.get()); + } + + if (!s.ok()) { + return s; + } + + block_entry->SetOwnedValue(block.release()); + + assert(s.ok()); + return s; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +template Status BlockBasedTable::RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + const BlockBasedTable* table, + std::unordered_map>* block_map) + : table_(table), block_map_(block_map) {} + +InternalIteratorBase* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { + // Return a block iterator on the index partition + auto block = block_map_->find(handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (block != block_map_->end()) { + const Rep* rep = table_->get_rep(); + assert(rep); + + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. 
+ return block->second.GetValue()->NewIndexIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + nullptr, kNullStats, true, rep->index_has_first_key, + rep->index_key_includes_seq, rep->index_value_is_full); + } + // Create an empty iterator + return new IndexBlockIter(); +} + +// This will be broken if the user specifies an unusual implementation +// of Options.comparator, or if the user specifies an unusual +// definition of prefixes in BlockBasedTableOptions.filter_policy. +// In particular, we require the following three properties: +// +// 1) key.starts_with(prefix(key)) +// 2) Compare(prefix(key), key) <= 0. +// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 +// +// Otherwise, this method guarantees no I/O will be incurred. +// +// REQUIRES: this method shouldn't be called while the DB lock is held. +bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) const { + if (!rep_->filter_policy) { + return true; + } + + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } + auto user_key = ExtractUserKey(internal_key); + if (!prefix_extractor->InDomain(user_key)) { + return true; + } + + bool may_match = true; + Status s; + + // First, try check with full filter + FilterBlockReader* const filter = rep_->filter.get(); + bool filter_checked = true; + if (filter != nullptr) { + if (!filter->IsBlockBased()) { + const Slice* const const_ikey_ptr = &internal_key; + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check, lookup_context); + } else { + // if prefix_extractor changed for block based filter, skip filter + if (need_upper_bound_check) { + return true; + } + auto prefix = prefix_extractor->Transform(user_key); + InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); + auto internal_prefix = internal_key_prefix.Encode(); + + // To prevent any io operation in this method, we set `read_tier` to make + // sure we always read index or filter only when they have already been + // loaded to memory. + ReadOptions no_io_read_options; + no_io_read_options.read_tier = kBlockCacheTier; + + // Then, try find it within each block + // we already know prefix_extractor and prefix_extractor_name must match + // because `CheckPrefixMayMatch` first checks `check_filter_ == true` + std::unique_ptr> iiter(NewIndexIterator( + no_io_read_options, + /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, lookup_context)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if ((rep_->index_key_includes_seq ? 
ExtractUserKey(iiter->key())
+                                           : iiter->key())
+                   .starts_with(ExtractUserKey(internal_prefix))) {
+      // we need to check for this subtle case because our only
+      // guarantee is that "the key is a string >= last key in that data
+      // block" according to the doc/table_format.txt spec.
+      //
+      // Suppose iiter->key() starts with the desired prefix; it is not
+      // necessarily the case that the corresponding data block will
+      // contain the prefix, since iiter->key() need not be in the
+      // block. However, the next data block may contain the prefix, so
+      // we return true to play it safe.
+      may_match = true;
+    } else if (filter->IsBlockBased()) {
+      // iiter->key() does NOT start with the desired prefix. Because
+      // Seek() finds the first key that is >= the seek target, this
+      // means that iiter->key() > prefix. Thus, any data blocks coming
+      // after the data block corresponding to iiter->key() cannot
+      // possibly contain the key. Thus, the corresponding data block
+      // is the only one that could potentially contain the prefix.
+      BlockHandle handle = iiter->value().handle;
+      may_match = filter->PrefixMayMatch(
+          prefix, prefix_extractor, handle.offset(), /*no_io=*/false,
+          /*const_key_ptr=*/nullptr, /*get_context=*/nullptr, lookup_context);
+    }
+  }
+  }
+
+  if (filter_checked) {
+    Statistics* statistics = rep_->ioptions.statistics;
+    RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
+    if (!may_match) {
+      RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
+    }
+  }
+
+  return may_match;
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) {
+  SeekImpl(&target);
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekToFirst() {
+  SeekImpl(nullptr);
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekImpl(
+    const Slice* target) {
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  if (target && !CheckPrefixMayMatch(*target)) {
+    ResetDataIter();
+    return;
+  }
+
+  bool need_seek_index = true;
+  if (block_iter_points_to_real_block_ && block_iter_.Valid()) {
+    // Reseek.
+    prev_block_offset_ = index_iter_->value().handle.offset();
+
+    if (target) {
+      // We can avoid an index seek if:
+      // 1. The new seek key is larger than the current key
+      // 2. The new seek key is within the upper bound of the block
+      // Since we don't necessarily know the internal key for either
+      // the current key or the upper bound, we check user keys and
+      // exclude the equality case. Considering internal keys can
+      // improve for the boundary cases, but it would complicate the
+      // code.
+      if (user_comparator_.Compare(ExtractUserKey(*target),
+                                   block_iter_.user_key()) > 0 &&
+          user_comparator_.Compare(ExtractUserKey(*target),
+                                   index_iter_->user_key()) < 0) {
+        need_seek_index = false;
+      }
+    }
+  }
+
+  if (need_seek_index) {
+    if (target) {
+      index_iter_->Seek(*target);
+    } else {
+      index_iter_->SeekToFirst();
+    }
+
+    if (!index_iter_->Valid()) {
+      ResetDataIter();
+      return;
+    }
+  }
+
+  IndexValue v = index_iter_->value();
+  const bool same_block = block_iter_points_to_real_block_ &&
+                          v.handle.offset() == prev_block_offset_;
+
+  // TODO(kolmike): Remove the != kBlockCacheTier condition.
+  if (!v.first_internal_key.empty() && !same_block &&
+      (!target || icomp_.Compare(*target, v.first_internal_key) <= 0) &&
+      read_options_.read_tier != kBlockCacheTier) {
+    // Index contains the first key of the block, and it's >= target.
+    // We can defer reading the block.
+    is_at_first_key_from_index_ = true;
+    // ResetDataIter() will invalidate block_iter_.
+    // Thus, there is no need to call CheckDataBlockWithinUpperBound() to
+    // check for iterate_upper_bound, as that will be done later when the
+    // data block is actually read.
+    ResetDataIter();
+  } else {
+    // Need to use the data block.
+    if (!same_block) {
+      InitDataBlock();
+    } else {
+      // When the user does a reseek, the iterate_upper_bound might have
+      // changed. CheckDataBlockWithinUpperBound() needs to be called
+      // explicitly if the reseek ends up in the same data block.
+      // If the reseek ends up in a different block, InitDataBlock() will do
+      // the iterator upper bound check.
+      CheckDataBlockWithinUpperBound();
+    }
+
+    if (target) {
+      block_iter_.Seek(*target);
+    } else {
+      block_iter_.SeekToFirst();
+    }
+    FindKeyForward();
+  }
+
+  CheckOutOfBound();
+
+  if (target) {
+    assert(!Valid() || ((block_type_ == BlockType::kIndex &&
+                         !table_->get_rep()->index_key_includes_seq)
+                            ? (user_comparator_.Compare(ExtractUserKey(*target),
+                                                        key()) <= 0)
+                            : (icomp_.Compare(*target, key()) <= 0)));
+  }
+}
+
+template <class TBlockIter, typename TValue>
+void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev(
+    const Slice& target) {
+  is_out_of_bound_ = false;
+  is_at_first_key_from_index_ = false;
+  if (!CheckPrefixMayMatch(target)) {
+    ResetDataIter();
+    return;
+  }
+
+  SavePrevIndexValue();
+
+  // Call Seek() rather than SeekForPrev() in the index block, because the
+  // target data block will likely contain the position for `target`, the
+  // same as Seek(), rather than the position before it.
+  // For example, if we have three data blocks, each containing two keys:
+  //   [2, 4]  [6, 8]  [10, 12]
+  // (the keys in the index block would be [4, 8, 12])
+  // and the user calls SeekForPrev(7), we need to go to the second block,
+  // just like if they call Seek(7).
+  // The only case where the block is different is when they seek to a
+  // position on the boundary. For example, if they SeekForPrev(5), we should
+  // go to the first block, rather than the second. However, we don't have
+  // the information to distinguish the two cases unless we read the second
+  // block. In this case, we'll end up reading two blocks.
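+  // Concretely, with the blocks above: SeekForPrev(7) seeks the index to the
+  // entry for [6, 8], and block_iter_.SeekForPrev(7) lands on 6.
+  // SeekForPrev(5) also seeks the index to [6, 8]; the in-block
+  // SeekForPrev(5) finds nothing, so FindKeyBackward() below steps back to
+  // [2, 4] and lands on 4.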
+ index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + if (!index_iter_->status().ok()) { + ResetDataIter(); + return; + } + + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +template +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + is_at_first_key_from_index_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); + CheckDataBlockWithinUpperBound(); +} + +template +void BlockBasedTableIterator::Next() { + if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) { + return; + } + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); + CheckOutOfBound(); +} + +template +bool BlockBasedTableIterator::NextAndGetResult( + IterateResult* result) { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; +} + +template +void BlockBasedTableIterator::Prev() { + if (is_at_first_key_from_index_) { + is_at_first_key_from_index_ = false; + + index_iter_->Prev(); + if (!index_iter_->Valid()) { + return; + } + + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + } + + FindKeyBackward(); +} + +template +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value().handle; + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_block_offset_ || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + // Prefetch additional data for range scans (iterators). Enabled only for + // user reads. + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is set. + if (lookup_context_.caller != TableReaderCaller::kCompaction) { + if (read_options_.readahead_size == 0) { + // Implicit auto readahead + num_file_reads_++; + if (num_file_reads_ > + BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { + if (!rep->file->use_direct_io() && + (data_block_handle.offset() + + static_cast(block_size(data_block_handle)) > + readahead_limit_)) { + // Buffered I/O + // Discarding the return status of Prefetch calls intentionally, as + // we can fallback to reading from disk if Prefetch fails. + rep->file->Prefetch(data_block_handle.offset(), readahead_size_); + readahead_limit_ = static_cast(data_block_handle.offset() + + readahead_size_); + // Keep exponentially increasing readahead size until + // kMaxAutoReadaheadSize. + readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize, + readahead_size_ * 2); + } else if (rep->file->use_direct_io() && !prefetch_buffer_) { + // Direct I/O + // Let FilePrefetchBuffer take care of the readahead. 
+ prefetch_buffer_.reset(new FilePrefetchBuffer( + rep->file.get(), BlockBasedTable::kInitAutoReadaheadSize, + BlockBasedTable::kMaxAutoReadaheadSize)); + } + } + } else if (!prefetch_buffer_) { + // Explicit user requested readahead + // The actual condition is: + // if (read_options_.readahead_size != 0 && !prefetch_buffer_) + prefetch_buffer_.reset(new FilePrefetchBuffer( + rep->file.get(), read_options_.readahead_size, + read_options_.readahead_size)); + } + } else if (!prefetch_buffer_) { + prefetch_buffer_.reset( + new FilePrefetchBuffer(rep->file.get(), compaction_readahead_size_, + compaction_readahead_size_)); + } + + Status s; + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, block_type_, + /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(), + /*for_compaction=*/lookup_context_.caller == + TableReaderCaller::kCompaction); + block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + } +} + +template +bool BlockBasedTableIterator::MaterializeCurrentBlock() { + assert(is_at_first_key_from_index_); + assert(!block_iter_points_to_real_block_); + assert(index_iter_->Valid()); + + is_at_first_key_from_index_ = false; + InitDataBlock(); + assert(block_iter_points_to_real_block_); + block_iter_.SeekToFirst(); + + if (!block_iter_.Valid() || + icomp_.Compare(block_iter_.key(), + index_iter_->value().first_internal_key) != 0) { + // Uh oh. + block_iter_.Invalidate(Status::Corruption( + "first key in index doesn't match first key in block")); + return false; + } + + return true; +} + +template +void BlockBasedTableIterator::FindKeyForward() { + // This method's code is kept short to make it likely to be inlined. + + assert(!is_out_of_bound_); + assert(block_iter_points_to_real_block_); + + if (!block_iter_.Valid()) { + // This is the only call site of FindBlockForward(), but it's extracted into + // a separate method to keep FindKeyForward() short and likely to be + // inlined. When transitioning to a different block, we call + // FindBlockForward(), which is much longer and is probably not inlined. + FindBlockForward(); + } else { + // This is the fast path that avoids a function call. + } +} + +template +void BlockBasedTableIterator::FindBlockForward() { + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". + do { + if (!block_iter_.status().ok()) { + return; + } + // Whether next data block is out of upper bound, if there is one. + const bool next_block_is_out_of_bound = + read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && !data_block_within_upper_bound_; + assert(!next_block_is_out_of_bound || + user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) <= 0); + ResetDataIter(); + index_iter_->Next(); + if (next_block_is_out_of_bound) { + // The next block is out of bound. No need to read it. + TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); + // We need to make sure this is not the last data block before setting + // is_out_of_bound_, since the index key for the last data block can be + // larger than smallest key of the next file on the same level. + if (index_iter_->Valid()) { + is_out_of_bound_ = true; + } + return; + } + + if (!index_iter_->Valid()) { + return; + } + + IndexValue v = index_iter_->value(); + + // TODO(kolmike): Remove the != kBlockCacheTier condition. 
+ if (!v.first_internal_key.empty() && + read_options_.read_tier != kBlockCacheTier) { + // Index contains the first key of the block. Defer reading the block. + is_at_first_key_from_index_ = true; + return; + } + + InitDataBlock(); + block_iter_.SeekToFirst(); + } while (!block_iter_.Valid()); +} + +template +void BlockBasedTableIterator::FindKeyBackward() { + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have check lower bound here too, but we opt not to do it for + // code simplicity. +} + +template +void BlockBasedTableIterator::CheckOutOfBound() { + if (read_options_.iterate_upper_bound != nullptr && Valid()) { + is_out_of_bound_ = user_comparator_.Compare( + *read_options_.iterate_upper_bound, user_key()) <= 0; + } +} + +template +void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { + if (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_) { + data_block_within_upper_bound_ = + (user_comparator_.Compare(*read_options_.iterate_upper_bound, + index_iter_->user_key()) > 0); + } +} + +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size) { + BlockCacheLookupContext lookup_context{caller}; + bool need_upper_bound_check = + PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + if (arena == nullptr) { + return new BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } else { + auto* mem = + arena->AllocateAligned(sizeof(BlockBasedTableIterator)); + return new (mem) BlockBasedTableIterator( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch, + /*input_iter=*/nullptr, /*get_context=*/nullptr, &lookup_context), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, BlockType::kData, caller, + compaction_readahead_size); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + const ReadOptions& read_options) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); + } + return new FragmentedRangeTombstoneIterator( + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); +} + +bool BlockBasedTable::FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + const Slice* const 
const_ikey_ptr = &internal_key; + bool may_match = true; + if (rep_->whole_key_filtering) { + size_t ts_sz = + rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); + may_match = + filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, + no_io, const_ikey_ptr, get_context, lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0 && + prefix_extractor->InDomain(user_key) && + !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), + prefix_extractor, kNotValid, no_io, + const_ikey_ptr, get_context, + lookup_context)) { + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } + return may_match; +} + +void BlockBasedTable::FullFilterKeysMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + MultiGetRange* range, const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + if (filter == nullptr || filter->IsBlockBased()) { + return; + } + if (rep_->whole_key_filtering) { + filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io, + lookup_context); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0) { + filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, + lookup_context); + } +} + +Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= 8); // key must be internal key + assert(get_context != nullptr); + Status s; + const bool no_io = read_options.read_tier == kBlockCacheTier; + + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + + // First check the full filter + // If full filter not useful, Then go into each block + uint64_t tracing_get_id = get_context->get_tracing_get_id(); + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserGet, tracing_get_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Trace the key since it contains both user key and sequence number. + lookup_context.referenced_key = key.ToString(); + lookup_context.get_from_user_specified_snapshot = + read_options.snapshot != nullptr; + } + const bool may_match = + FullFilterKeyMayMatch(read_options, filter, key, no_io, prefix_extractor, + get_context, &lookup_context); + if (!may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } else { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
+ bool need_upper_bound_check = false;
+ if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
+ need_upper_bound_check = PrefixExtractorChanged(
+ rep_->table_properties.get(), prefix_extractor);
+ }
+ auto iiter =
+ NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
+ get_context, &lookup_context);
+ std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+ if (iiter != &iiter_on_stack) {
+ iiter_unique_ptr.reset(iiter);
+ }
+
+ size_t ts_sz =
+ rep_->internal_comparator.user_comparator()->timestamp_size();
+ bool matched = false; // if such user key matched a key in SST
+ bool done = false;
+ for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) {
+ IndexValue v = iiter->value();
+
+ bool not_exist_in_filter =
+ filter != nullptr && filter->IsBlockBased() == true &&
+ !filter->KeyMayMatch(ExtractUserKeyAndStripTimestamp(key, ts_sz),
+ prefix_extractor, v.handle.offset(), no_io,
+ /*const_ikey_ptr=*/nullptr, get_context,
+ &lookup_context);
+
+ if (not_exist_in_filter) {
+ // Not found
+ // TODO: think about interaction with Merge. If a user key cannot
+ // cross one data block, we should be fine.
+ RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
+ break;
+ }
+
+ if (!v.first_internal_key.empty() && !skip_filters &&
+ UserComparatorWrapper(rep_->internal_comparator.user_comparator())
+ .Compare(ExtractUserKey(key),
+ ExtractUserKey(v.first_internal_key)) < 0) {
+ // The requested key falls between highest key in previous block and
+ // lowest key in current block.
+ break;
+ }
+
+ BlockCacheLookupContext lookup_data_block_context{
+ TableReaderCaller::kUserGet, tracing_get_id,
+ /*get_from_user_specified_snapshot=*/read_options.snapshot !=
+ nullptr};
+ bool does_referenced_key_exist = false;
+ DataBlockIter biter;
+ uint64_t referenced_data_size = 0;
+ NewDataBlockIterator<DataBlockIter>(
+ read_options, v.handle, &biter, BlockType::kData, get_context,
+ &lookup_data_block_context,
+ /*s=*/Status(), /*prefetch_buffer*/ nullptr);
+
+ if (no_io && biter.status().IsIncomplete()) {
+ // couldn't get block from block_cache
+ // Update Saver.state to Found because we are only looking for
+ // whether we can guarantee the key is not there when "no_io" is set
+ get_context->MarkKeyMayExist();
+ break;
+ }
+ if (!biter.status().ok()) {
+ s = biter.status();
+ break;
+ }
+
+ bool may_exist = biter.SeekForGet(key);
+ // If user-specified timestamp is supported, we cannot end the search
+ // just because hash index lookup indicates the key+ts does not exist.
+ if (!may_exist && ts_sz == 0) {
+ // HashSeek cannot find the key in this block and the iter is not at
+ // the end of the block, i.e. cannot be in the following blocks
+ // either. In this case, the seek_key cannot be found, so we break
+ // from the top level for-loop.
+ done = true;
+ } else {
+ // Call the *saver function on each entry/block until it returns false
+ for (; biter.Valid(); biter.Next()) {
+ ParsedInternalKey parsed_key;
+ if (!ParseInternalKey(biter.key(), &parsed_key)) {
+ s = Status::Corruption(Slice());
+ }
+
+ if (!get_context->SaveValue(
+ parsed_key, biter.value(), &matched,
+ biter.IsValuePinned() ? &biter : nullptr)) {
+ if (get_context->State() == GetContext::GetState::kFound) {
+ does_referenced_key_exist = true;
+ referenced_data_size = biter.key().size() + biter.value().size();
+ }
+ done = true;
+ break;
+ }
+ }
+ s = biter.status();
+ }
+ // Write the block cache access record.
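The access record below is only emitted when a BlockCacheTracer has been attached to the table reader. As a rough end-to-end sketch, tracing is typically switched on at the DB level; the StartBlockCacheTrace()/EndBlockCacheTrace() entry points and the NewFileTraceWriter() helper assumed here accompany this tracer but exact signatures may differ by release, and db stands for an already opened rocksdb::DB*.

// Hypothetical usage sketch, not part of this diff.
std::unique_ptr<rocksdb::TraceWriter> trace_writer;
rocksdb::Status st = rocksdb::NewFileTraceWriter(
    rocksdb::Env::Default(), rocksdb::EnvOptions(),
    "/tmp/block_cache_trace", &trace_writer);
if (st.ok()) {
  rocksdb::TraceOptions trace_opts;
  trace_opts.sampling_frequency = 1;  // record every block access
  st = db->StartBlockCacheTrace(trace_opts, std::move(trace_writer));
}
// ... run the workload, then stop tracing and analyze the trace file offline:
// db->EndBlockCacheTrace();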
+ if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter.key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + } + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok()) { + s = iiter->status(); + } + } + + return s; +} + +using MultiGetRange = MultiGetContext::Range; +void BlockBasedTable::MultiGet(const ReadOptions& read_options, + const MultiGetRange* mget_range, + const SliceTransform* prefix_extractor, + bool skip_filters) { + FilterBlockReader* const filter = + !skip_filters ? rep_->filter.get() : nullptr; + MultiGetRange sst_file_range(*mget_range, mget_range->begin(), + mget_range->end()); + + // First check the full filter + // If full filter not useful, Then go into each block + const bool no_io = read_options.read_tier == kBlockCacheTier; + uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId; + if (!sst_file_range.empty() && sst_file_range.begin()->get_context) { + tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); + } + BlockCacheLookupContext lookup_context{ + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; + FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, + prefix_extractor, &lookup_context); + + if (skip_filters || !sst_file_range.empty()) { + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
+ bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + sst_file_range.begin()->get_context, &lookup_context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr.reset(iiter); + } + + uint64_t offset = std::numeric_limits::max(); + autovector block_handles; + autovector, MultiGetContext::MAX_BATCH_SIZE> results; + autovector statuses; + char stack_buf[kMultiGetReadStackBufSize]; + std::unique_ptr block_buf; + { + MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), + sst_file_range.end()); + + CachableEntry uncompression_dict; + Status uncompression_dict_status; + if (rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, no_io, + sst_file_range.begin()->get_context, &lookup_context, + &uncompression_dict); + } + + const UncompressionDict& dict = uncompression_dict.GetValue() + ? *uncompression_dict.GetValue() + : UncompressionDict::GetEmptyDict(); + + size_t total_len = 0; + ReadOptions ro = read_options; + ro.read_tier = kBlockCacheTier; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + *(miter->s) = iiter->status(); + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + if (!uncompression_dict_status.ok()) { + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + + statuses.emplace_back(); + results.emplace_back(); + if (v.handle.offset() == offset) { + // We're going to reuse the block for this key later on. No need to + // look it up now. Place a null handle + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + continue; + } + // Lookup the cache for the given data block referenced by an index + // iterator value (i.e BlockHandle). If it exists in the cache, + // initialize block to the contents of the data block. + offset = v.handle.offset(); + BlockHandle handle = v.handle; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet); + Status s = RetrieveBlock( + nullptr, ro, handle, dict, &(results.back()), BlockType::kData, + miter->get_context, &lookup_data_block_context, + /* for_compaction */ false, /* use_cache */ true); + if (s.IsIncomplete()) { + s = Status::OK(); + } + if (s.ok() && !results.back().IsEmpty()) { + // Found it in the cache. 
Add NULL handle to indicate there is + // nothing to read from disk + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + block_handles.emplace_back(handle); + total_len += block_size(handle); + } + } + + if (total_len) { + char* scratch = nullptr; + // If the blocks need to be uncompressed and we don't need the + // compressed blocks, then we can use a contiguous block of + // memory to read in all the blocks as it will be temporary + // storage + // 1. If blocks are compressed and compressed block cache is there, + // alloc heap bufs + // 2. If blocks are uncompressed, alloc heap bufs + // 3. If blocks are compressed and no compressed block cache, use + // stack buf + if (rep_->table_options.block_cache_compressed == nullptr && + rep_->blocks_maybe_compressed) { + if (total_len <= kMultiGetReadStackBufSize) { + scratch = stack_buf; + } else { + scratch = new char[total_len]; + block_buf.reset(scratch); + } + } + RetrieveMultipleBlocks(read_options, &data_block_range, &block_handles, + &statuses, &results, scratch, dict); + } + } + + DataBlockIter first_biter; + DataBlockIter next_biter; + size_t idx_in_batch = 0; + for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); + ++miter) { + Status s; + GetContext* get_context = miter->get_context; + const Slice& key = miter->ikey; + bool matched = false; // if such user key matched a key in SST + bool done = false; + bool first_block = true; + do { + DataBlockIter* biter = nullptr; + bool reusing_block = true; + uint64_t referenced_data_size = 0; + bool does_referenced_key_exist = false; + BlockCacheLookupContext lookup_data_block_context( + TableReaderCaller::kUserMultiGet, tracing_mget_id, + /*get_from_user_specified_snapshot=*/read_options.snapshot != + nullptr); + if (first_block) { + if (!block_handles[idx_in_batch].IsNull() || + !results[idx_in_batch].IsEmpty()) { + first_biter.Invalidate(Status::OK()); + NewDataBlockIterator( + read_options, results[idx_in_batch], &first_biter, + statuses[idx_in_batch]); + reusing_block = false; + } + biter = &first_biter; + idx_in_batch++; + } else { + IndexValue v = iiter->value(); + if (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper(rep_->internal_comparator.user_comparator()) + .Compare(ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + break; + } + + next_biter.Invalidate(Status::OK()); + NewDataBlockIterator( + read_options, iiter->value().handle, &next_biter, + BlockType::kData, get_context, &lookup_data_block_context, + Status(), nullptr); + biter = &next_biter; + reusing_block = false; + } + + if (read_options.read_tier == kBlockCacheTier && + biter->status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); + break; + } + if (!biter->status().ok()) { + s = biter->status(); + break; + } + + bool may_exist = biter->SeekForGet(key); + if (!may_exist) { + // HashSeek cannot find the key this block and the the iter is not + // the end of the block, i.e. cannot be in the following blocks + // either. In this case, the seek_key cannot be found, so we break + // from the top level for-loop. 
+ break; + } + + // Call the *saver function on each entry/block until it returns false + for (; biter->Valid(); biter->Next()) { + ParsedInternalKey parsed_key; + Cleanable dummy; + Cleanable* value_pinner = nullptr; + if (!ParseInternalKey(biter->key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + if (biter->IsValuePinned()) { + if (reusing_block) { + Cache* block_cache = rep_->table_options.block_cache.get(); + assert(biter->cache_handle() != nullptr); + block_cache->Ref(biter->cache_handle()); + dummy.RegisterCleanup(&ReleaseCachedEntry, block_cache, + biter->cache_handle()); + value_pinner = &dummy; + } else { + value_pinner = biter; + } + } + if (!get_context->SaveValue(parsed_key, biter->value(), &matched, + value_pinner)) { + if (get_context->State() == GetContext::GetState::kFound) { + does_referenced_key_exist = true; + referenced_data_size = + biter->key().size() + biter->value().size(); + } + done = true; + break; + } + s = biter->status(); + } + // Write the block cache access. + if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { + // Avoid making copy of block_key, cf_name, and referenced_key when + // constructing the access record. + Slice referenced_key; + if (does_referenced_key_exist) { + referenced_key = biter->key(); + } else { + referenced_key = key; + } + BlockCacheTraceRecord access_record( + rep_->ioptions.env->NowMicros(), + /*block_key=*/"", lookup_data_block_context.block_type, + lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), + rep_->sst_number_for_tracing(), lookup_data_block_context.caller, + lookup_data_block_context.is_cache_hit, + lookup_data_block_context.no_insert, + lookup_data_block_context.get_id, + lookup_data_block_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_data_block_context.num_keys_in_block, + does_referenced_key_exist); + block_cache_tracer_->WriteBlockAccess( + access_record, lookup_data_block_context.block_key, + rep_->cf_name_for_tracing(), referenced_key); + } + s = biter->status(); + if (done) { + // Avoid the extra Next which is expensive in two-level indexes + break; + } + if (first_block) { + iiter->Seek(key); + } + first_block = false; + iiter->Next(); + } while (iiter->Valid()); + + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } + if (s.ok()) { + s = iiter->status(); + } + *(miter->s) = s; + } + } +} + +Status BlockBasedTable::Prefetch(const Slice* const begin, + const Slice* const end) { + auto& comparator = rep_->internal_comparator; + UserComparatorWrapper user_comparator(comparator.user_comparator()); + // pre-condition + if (begin && end && comparator.Compare(*begin, *end) > 0) { + return Status::InvalidArgument(*begin, *end); + } + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + &iiter_on_stack, /*get_context=*/nullptr, + &lookup_context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + + // indicates if we are on the last page that need to be pre-fetched + bool prefetching_boundary_page = false; + + for 
(begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); + iiter->Next()) { + BlockHandle block_handle = iiter->value().handle; + const bool is_user_key = !rep_->index_key_includes_seq; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator.Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { + if (prefetching_boundary_page) { + break; + } + + // The index entry represents the last key in the data block. + // We should load this page into memory as well, but no more + prefetching_boundary_page = true; + } + + // Load the block specified by the block_handle into the block cache + DataBlockIter biter; + + NewDataBlockIterator( + ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + /*get_context=*/nullptr, &lookup_context, Status(), + /*prefetch_buffer=*/nullptr); + + if (!biter.status().ok()) { + // there was an unexpected error while pre-fetching + return biter.status(); + } + } + + return Status::OK(); +} + +Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, + TableReaderCaller caller) { + Status s; + // Check Meta blocks + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + s = ReadMetaIndexBlock(nullptr /* prefetch buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + IndexBlockIter iiter_on_stack; + BlockCacheLookupContext context{caller}; + InternalIteratorBase* iiter = NewIndexIterator( + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, + /*get_context=*/nullptr, &context); + std::unique_ptr> iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr>(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(read_options, iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks( + const ReadOptions& read_options, + InternalIteratorBase* index_iter) { + Status s; + // We are scanning the whole file, so no need to do exponential + // increasing of the buffer size. + size_t readahead_size = (read_options.readahead_size != 0) + ? read_options.readahead_size + : kMaxAutoReadaheadSize; + // FilePrefetchBuffer doesn't work in mmap mode and readahead is not + // needed there. 
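Since verification scans the whole file sequentially, the readahead chosen above comes either from ReadOptions::readahead_size or, when that field is left at zero, from the fixed kMaxAutoReadaheadSize cap. A caller-side sketch follows; it assumes a DB-level VerifyChecksum() overload that forwards these read options, which may not exist in every release.

// Hypothetical sketch: bound the sequential I/O used while verifying checksums.
rocksdb::ReadOptions verify_options;
verify_options.readahead_size = 4 * 1024 * 1024;  // 4 MB readahead per read
rocksdb::Status verify_status = db->VerifyChecksum(verify_options);  // assumed overload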
+ FilePrefetchBuffer prefetch_buffer( + rep_->file.get(), readahead_size /* readadhead_size */, + readahead_size /* max_readahead_size */, + !rep_->ioptions.allow_mmap_reads /* enable */); + + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle = index_iter->value().handle; + BlockContents contents; + BlockFetcher block_fetcher( + rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle, + &contents, rep_->ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kData, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( + const Slice& meta_block_name) { + if (meta_block_name.starts_with(kFilterBlockPrefix) || + meta_block_name.starts_with(kFullFilterBlockPrefix) || + meta_block_name.starts_with(kPartitionedFilterBlockPrefix)) { + return BlockType::kFilter; + } + + if (meta_block_name == kPropertiesBlock) { + return BlockType::kProperties; + } + + if (meta_block_name == kCompressionDictBlock) { + return BlockType::kCompressionDictionary; + } + + if (meta_block_name == kRangeDelBlock) { + return BlockType::kRangeDeletion; + } + + if (meta_block_name == kHashIndexPrefixesBlock) { + return BlockType::kHashIndexPrefixes; + } + + if (meta_block_name == kHashIndexPrefixesMetadataBlock) { + return BlockType::kHashIndexMetadata; + } + + assert(false); + return BlockType::kInvalid; +} + +Status BlockBasedTable::VerifyChecksumInMetaBlocks( + InternalIteratorBase* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + BlockContents contents; + const Slice meta_block_name = index_iter->key(); + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + GetBlockTypeForMetaBlockByName(meta_block_name), + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (s.IsCorruption() && meta_block_name == kPropertiesBlock) { + TableProperties* table_properties; + s = TryReadPropertiesWithGlobalSeqno(nullptr /* prefetch_buffer */, + index_iter->value(), + &table_properties); + delete table_properties; + } + if (!s.ok()) { + break; + } + } + return s; +} + +bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const { + assert(rep_ != nullptr); + + Cache* const cache = rep_->table_options.block_cache.get(); + if (cache == nullptr) { + return false; + } + + char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice cache_key = + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, + cache_key_storage); + + Cache::Handle* const cache_handle = cache->Lookup(cache_key); + if (cache_handle == nullptr) { + return false; + } + + cache->Release(cache_handle); + + return true; +} + +bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, + const Slice& key) { + std::unique_ptr> iiter(NewIndexIterator( + options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); + iiter->Seek(key); + assert(iiter->Valid()); 
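The handle produced by this index lookup is what TEST_BlockInCache() above turns into a block cache key before probing the cache. For illustration, that key is simply the per-table cache key prefix followed by the block's varint64-encoded file offset; the helper below is a hypothetical standalone restatement of that layout, not code from this change, and EncodeVarint64() is the usual util/coding.h routine.

// Sketch of the cache key layout used by the block cache probes here.
// Assumes <cstring>, "rocksdb/slice.h", "table/format.h" and "util/coding.h".
static rocksdb::Slice MakeBlockCacheKeySketch(const char* prefix,
                                              size_t prefix_size,
                                              const rocksdb::BlockHandle& handle,
                                              char* scratch) {
  memcpy(scratch, prefix, prefix_size);  // per-table prefix bytes
  // followed by the varint64-encoded offset of the block within the file
  char* end =
      rocksdb::EncodeVarint64(scratch + prefix_size, handle.offset());
  return rocksdb::Slice(scratch, static_cast<size_t>(end - scratch));
}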
+ + return TEST_BlockInCache(iiter->value().handle); +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader( + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader) { + // kHashSearch requires non-empty prefix_extractor but bypass checking + // prefix_extractor here since we have no access to MutableCFOptions. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. + // If prefix_extractor does not match prefix_extractor_name from table + // properties, turn off Hash Index by setting total_order_seek to true + + switch (rep_->index_type) { + case BlockBasedTableOptions::kTwoLevelIndexSearch: { + return PartitionIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kBinarySearch: + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + return BinarySearchIndexReader::Create(this, prefetch_buffer, use_cache, + prefetch, pin, lookup_context, + index_reader); + } + case BlockBasedTableOptions::kHashSearch: { + std::unique_ptr metaindex_guard; + std::unique_ptr metaindex_iter_guard; + auto meta_index_iter = preloaded_meta_index_iter; + if (meta_index_iter == nullptr) { + auto s = ReadMetaIndexBlock(prefetch_buffer, &metaindex_guard, + &metaindex_iter_guard); + if (!s.ok()) { + // we simply fall back to binary search in case there is any + // problem with prefix hash index loading. + ROCKS_LOG_WARN(rep_->ioptions.info_log, + "Unable to read the metaindex block." + " Fall back to binary search index."); + return BinarySearchIndexReader::Create(this, prefetch_buffer, + use_cache, prefetch, pin, + lookup_context, index_reader); + } + meta_index_iter = metaindex_iter_guard.get(); + } + + return HashIndexReader::Create(this, prefetch_buffer, meta_index_iter, + use_cache, prefetch, pin, lookup_context, + index_reader); + } + default: { + std::string error_message = + "Unrecognized index type: " + ToString(rep_->index_type); + return Status::InvalidArgument(error_message.c_str()); + } + } +} + +uint64_t BlockBasedTable::ApproximateOffsetOf( + const InternalIteratorBase& index_iter) const { + uint64_t result = 0; + if (index_iter.Valid()) { + BlockHandle handle = index_iter.value().handle; + result = handle.offset(); + } else { + // The iterator is past the last key in the file. If table_properties is not + // available, approximate the offset by returning the offset of the + // metaindex block (which is right near the end of the file). + if (rep_->table_properties) { + result = rep_->table_properties->data_size; + } + // table_properties is not present in the table. 
+ if (result == 0) { + result = rep_->footer.metaindex_handle().offset(); + } + } + + return result; +} + +uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) { + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + auto index_iter = + NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(key); + return ApproximateOffsetOf(*index_iter); +} + +uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) { + assert(rep_->internal_comparator.Compare(start, end) <= 0); + + BlockCacheLookupContext context(caller); + IndexBlockIter iiter_on_stack; + auto index_iter = + NewIndexIterator(ReadOptions(), /*disable_prefix_seek=*/false, + /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, + /*lookup_context=*/&context); + std::unique_ptr> iiter_unique_ptr; + if (index_iter != &iiter_on_stack) { + iiter_unique_ptr.reset(index_iter); + } + + index_iter->Seek(start); + uint64_t start_offset = ApproximateOffsetOf(*index_iter); + index_iter->Seek(end); + uint64_t end_offset = ApproximateOffsetOf(*index_iter); + + assert(end_offset >= start_offset); + return end_offset - start_offset; +} + +bool BlockBasedTable::TEST_FilterBlockInCache() const { + assert(rep_ != nullptr); + return TEST_BlockInCache(rep_->filter_handle); +} + +bool BlockBasedTable::TEST_IndexBlockInCache() const { + assert(rep_ != nullptr); + + return TEST_BlockInCache(rep_->footer.index_handle()); +} + +Status BlockBasedTable::GetKVPairsFromDataBlocks( + std::vector* kv_pair_blocks) { + std::unique_ptr> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + + Status s = blockhandles_iter->status(); + if (!s.ok()) { + // Cannot read Index Block + return s; + } + + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + + if (!s.ok()) { + break; + } + + std::unique_ptr datablock_iter; + datablock_iter.reset(NewDataBlockIterator( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + // Error reading the block - Skipped + continue; + } + + KVPairBlock kv_pair_block; + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + // Error reading the block - Skipped + break; + } + const Slice& key = datablock_iter->key(); + const Slice& value = datablock_iter->value(); + std::string key_copy = std::string(key.data(), key.size()); + std::string value_copy = std::string(value.data(), value.size()); + + kv_pair_block.push_back( + std::make_pair(std::move(key_copy), std::move(value_copy))); + } + kv_pair_blocks->push_back(std::move(kv_pair_block)); + } + return Status::OK(); +} + +Status BlockBasedTable::DumpTable(WritableFile* out_file) { + // Output Footer + out_file->Append( + "Footer Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->footer.ToString().c_str()); + out_file->Append("\n"); + + // 
Output MetaIndex + out_file->Append( + "Metaindex Details:\n" + "--------------------------------------\n"); + std::unique_ptr metaindex; + std::unique_ptr metaindex_iter; + Status s = ReadMetaIndexBlock(nullptr /* prefetch_buffer */, &metaindex, + &metaindex_iter); + if (s.ok()) { + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + s = metaindex_iter->status(); + if (!s.ok()) { + return s; + } + if (metaindex_iter->key() == rocksdb::kPropertiesBlock) { + out_file->Append(" Properties block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == rocksdb::kCompressionDictBlock) { + out_file->Append(" Compression dictionary block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (strstr(metaindex_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_file->Append(" Filter block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (metaindex_iter->key() == rocksdb::kRangeDelBlock) { + out_file->Append(" Range deletion block handle: "); + out_file->Append(metaindex_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } + } + out_file->Append("\n"); + } else { + return s; + } + + // Output TableProperties + const rocksdb::TableProperties* table_properties; + table_properties = rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_file->Append( + "Table Properties:\n" + "--------------------------------------\n" + " "); + out_file->Append(table_properties->ToString("\n ", ": ").c_str()); + out_file->Append("\n"); + } + + if (rep_->filter) { + out_file->Append( + "Filter Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->filter->ToString().c_str()); + out_file->Append("\n"); + } + + // Output Index block + s = DumpIndexBlock(out_file); + if (!s.ok()) { + return s; + } + + // Output compression dictionary + if (rep_->uncompression_dict_reader) { + CachableEntry uncompression_dict; + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* get_context */, nullptr /* lookup_context */, + &uncompression_dict); + if (!s.ok()) { + return s; + } + + assert(uncompression_dict.GetValue()); + + const Slice& raw_dict = uncompression_dict.GetValue()->GetRawDict(); + out_file->Append( + "Compression Dictionary:\n" + "--------------------------------------\n"); + out_file->Append(" size (bytes): "); + out_file->Append(rocksdb::ToString(raw_dict.size())); + out_file->Append("\n\n"); + out_file->Append(" HEX "); + out_file->Append(raw_dict.ToString(true).c_str()); + out_file->Append("\n\n"); + } + + // Output range deletions block + auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + out_file->Append( + "Range deletions:\n" + "--------------------------------------\n" + " "); + for (; range_del_iter->Valid(); range_del_iter->Next()) { + DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); + } + out_file->Append("\n"); + } + delete range_del_iter; + } + // Output Data blocks + s = DumpDataBlocks(out_file); + + return s; +} + +Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { + out_file->Append( + "Index Details:\n" + 
"--------------------------------------\n"); + std::unique_ptr> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + out_file->Append(" Block key hex dump: Data block handle\n"); + out_file->Append(" Block key ascii\n\n"); + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + Slice user_key; + InternalKey ikey; + if (!rep_->index_key_includes_seq) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } + + out_file->Append(" HEX "); + out_file->Append(user_key.ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(blockhandles_iter->value() + .ToString(true, rep_->index_has_first_key) + .c_str()); + out_file->Append("\n"); + + std::string str_key = user_key.ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { + std::unique_ptr> blockhandles_iter( + NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + /*input_iter=*/nullptr, /*get_context=*/nullptr, + /*lookup_contex=*/nullptr)); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + uint64_t datablock_size_min = std::numeric_limits::max(); + uint64_t datablock_size_max = 0; + uint64_t datablock_size_sum = 0; + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + BlockHandle bh = blockhandles_iter->value().handle; + uint64_t datablock_size = bh.size(); + datablock_size_min = std::min(datablock_size_min, datablock_size); + datablock_size_max = std::max(datablock_size_max, datablock_size); + datablock_size_sum += datablock_size; + + out_file->Append("Data Block # "); + out_file->Append(rocksdb::ToString(block_id)); + out_file->Append(" @ "); + out_file->Append(blockhandles_iter->value().handle.ToString(true).c_str()); + out_file->Append("\n"); + out_file->Append("--------------------------------------\n"); + + std::unique_ptr datablock_iter; + datablock_iter.reset(NewDataBlockIterator( + ReadOptions(), blockhandles_iter->value().handle, + /*input_iter=*/nullptr, /*type=*/BlockType::kData, + /*get_context=*/nullptr, /*lookup_context=*/nullptr, Status(), + /*prefetch_buffer=*/nullptr)); + s = datablock_iter->status(); + + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n\n"); + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n"); + break; + } + DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); + } + out_file->Append("\n"); + } + + uint64_t num_datablocks = block_id - 1; + if (num_datablocks) { + double datablock_size_avg 
= + static_cast(datablock_size_sum) / num_datablocks; + out_file->Append("Data Block Summary:\n"); + out_file->Append("--------------------------------------"); + out_file->Append("\n # data blocks: "); + out_file->Append(rocksdb::ToString(num_datablocks)); + out_file->Append("\n min data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_min)); + out_file->Append("\n max data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_max)); + out_file->Append("\n avg data block size: "); + out_file->Append(rocksdb::ToString(datablock_size_avg)); + out_file->Append("\n"); + } + + return Status::OK(); +} + +void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, + WritableFile* out_file) { + InternalKey ikey; + ikey.DecodeFrom(key); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(value.ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = value.ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + if (str_value[i] == '\0') { + res_value.append("\\0", 2); + } else { + res_value.append(&str_value[i], 1); + } + res_value.append(1, cspace); + } + + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append(": "); + out_file->Append(res_value.c_str()); + out_file->Append("\n ------\n"); +} + +} // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based/block_based_table_reader.h similarity index 55% rename from table/block_based_table_reader.h rename to table/block_based/block_based_table_reader.h index 1fcc8cbfa07..bcc7fd1a3da 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -17,15 +17,20 @@ #include #include "db/range_tombstone_fragmenter.h" +#include "file/filename.h" +#include "file/random_access_file_reader.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_based_table_factory.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_type.h" +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" +#include "table/block_based/uncompression_dict_reader.h" #include "table/format.h" #include "table/get_context.h" #include "table/multiget_context.h" @@ -33,13 +38,12 @@ #include "table/table_properties_internal.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" +#include "trace_replay/block_cache_tracer.h" #include "util/coding.h" -#include "util/file_reader_writer.h" #include "util/user_comparator_wrapper.h" namespace rocksdb { -class BlockHandle; class Cache; class FilterBlockReader; class BlockBasedFilterBlockReader; @@ -58,9 +62,17 @@ class GetContext; typedef std::vector> KVPairBlock; -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. A Table may be safely accessed from -// multiple threads without external synchronization. 
+// Reader class for BlockBasedTable format.
+// For the format of BlockBasedTable refer to
+// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
+// This is the default table type. Data is chunked into fixed-size blocks and
+// each block in turn stores entries. When storing data, we can compress and/or
+// encode data efficiently within a block, which often results in a much smaller
+// data size compared with the raw data size. To retrieve a record, we first
+// locate the block where the target record may reside, then read the block
+// into memory, and finally search for that record within the block. To avoid
+// frequent reads of the same block, the block cache keeps loaded blocks in
+// memory.
 class BlockBasedTable : public TableReader {
  public:
 static const std::string kFilterBlockPrefix;
@@ -70,6 +82,13 @@ class BlockBasedTable : public TableReader {
 // For Posix files the unique ID is three varints.
 static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
+ // All the below fields control iterator readahead
+ static const size_t kInitAutoReadaheadSize = 8 * 1024;
+ // Found that 256 KB readahead size provides the best performance, based on
+ // experiments, for auto readahead. Experiment data is in PR #3282.
+ static const size_t kMaxAutoReadaheadSize;
+ static const int kMinNumFileReadsToStartAutoReadahead = 2;
+
 // Attempt to open the table that is stored in bytes [0..file_size)
 // of "file", and read the metadata entries necessary to allow
 // retrieving data from the table.
@@ -98,22 +117,26 @@ class BlockBasedTable : public TableReader {
 bool skip_filters = false, int level = -1,
 const bool immortal_table = false,
 const SequenceNumber largest_seqno = 0,
- TailPrefetchStats* tail_prefetch_stats = nullptr);
+ TailPrefetchStats* tail_prefetch_stats = nullptr,
+ BlockCacheTracer* const block_cache_tracer = nullptr);
 bool PrefixMayMatch(const Slice& internal_key,
 const ReadOptions& read_options,
 const SliceTransform* options_prefix_extractor,
- const bool need_upper_bound_check);
+ const bool need_upper_bound_check,
+ BlockCacheLookupContext* lookup_context) const;
 // Returns a new iterator over the table contents.
 // The result of NewIterator() is initially invalid (caller must
 // call one of the Seek methods on the iterator before using it).
 // @param skip_filters Disables loading/accessing the filter block
+ // compaction_readahead_size: its value will only be used if caller =
+ // kCompaction.
 InternalIterator* NewIterator(const ReadOptions&,
 const SliceTransform* prefix_extractor,
- Arena* arena = nullptr,
- bool skip_filters = false,
- bool for_compaction = false) override;
+ Arena* arena, bool skip_filters,
+ TableReaderCaller caller,
+ size_t compaction_readahead_size = 0) override;
 FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
 const ReadOptions& read_options) override;
@@ -135,11 +158,21 @@ class BlockBasedTable : public TableReader {
 // Given a key, return an approximate byte offset in the file where
 // the data for that key begins (or would begin if the key were
- // present in the file).  The returned value is in terms of file
+ // present in the file). The returned value is in terms of file
 // bytes, and so includes effects like compression of the underlying data.
 // E.g., the approximate offset of the last key in the table will
+ // be close to the file length.
- uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. + // The start key must not be greater than the end key. + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; + + bool TEST_BlockInCache(const BlockHandle& handle) const; // Returns true if the block for the specified key is in cache. // REQUIRES: key is in this table && block cache enabled @@ -154,64 +187,44 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form - Status DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor = nullptr) override; + Status DumpTable(WritableFile* out_file) override; - Status VerifyChecksum() override; - - void Close() override; + Status VerifyChecksum(const ReadOptions& readOptions, + TableReaderCaller caller) override; ~BlockBasedTable(); - bool TEST_filter_block_preloaded() const; - bool TEST_index_reader_preloaded() const; + bool TEST_FilterBlockInCache() const; + bool TEST_IndexBlockInCache() const; - // IndexReader is the interface that provide the functionality for index + // IndexReader is the interface that provides the functionality for index // access. class IndexReader { public: - explicit IndexReader(const InternalKeyComparator* icomparator, - Statistics* stats) - : icomparator_(icomparator), statistics_(stats) {} - - virtual ~IndexReader() {} - - // Create an iterator for index access. - // If iter is null then a new object is created on heap and the callee will - // have the ownership. If a non-null iter is passed in it will be used, and - // the returned value is either the same as iter or a new on-heap object - // that - // wrapps the passed iter. In the latter case the return value would point - // to - // a different object then iter and the callee has the ownership of the + virtual ~IndexReader() = default; + + // Create an iterator for index access. If iter is null, then a new object + // is created on the heap, and the callee will have the ownership. + // If a non-null iter is passed in, it will be used, and the returned value + // is either the same as iter or a new on-heap object that + // wraps the passed iter. In the latter case the return value points + // to a different object then iter, and the callee has the ownership of the // returned object. - virtual InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool total_order_seek = true, - bool fill_cache = true) = 0; - - // The size of the index. - virtual size_t size() const = 0; - // Memory usage of the index block - virtual size_t usable_size() const = 0; - // return the statistics pointer - virtual Statistics* statistics() const { return statistics_; } + virtual InternalIteratorBase* NewIterator( + const ReadOptions& read_options, bool disable_prefix_seek, + IndexBlockIter* iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; + // Report an approximation of how much memory has been used other than - // memory - // that was allocated in block cache. + // memory that was allocated in block cache. 
virtual size_t ApproximateMemoryUsage() const = 0; - - virtual void CacheDependencies(bool /* unused */) {} - - // Prefetch all the blocks referenced by this index to the buffer - void PrefetchBlocks(FilePrefetchBuffer* buf); - - protected: - const InternalKeyComparator* icomparator_; - - private: - Statistics* statistics_; + // Cache the dependencies of the index reader (e.g. the partitions + // of a partitioned index). + virtual void CacheDependencies(bool /* pin */) {} }; + class IndexReaderCommon; + static Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size, const BlockHandle& handle, char* cache_key); @@ -220,39 +233,62 @@ class BlockBasedTable : public TableReader { // The key retrieved are internal keys. Status GetKVPairsFromDataBlocks(std::vector* kv_pair_blocks); - template - struct CachableEntry; struct Rep; Rep* get_rep() { return rep_; } + const Rep* get_rep() const { return rep_; } // input_iter: if it is not null, update this one and return it as Iterator template - static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const Slice& index_value, - TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr); + TBlockIter* NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, Status s, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const; + + // input_iter: if it is not null, update this one and return it as Iterator template - static TBlockIter* NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& block_hanlde, - TBlockIter* input_iter = nullptr, bool is_index = false, - bool key_includes_seq = true, bool index_key_is_full = true, - GetContext* get_context = nullptr, Status s = Status(), - FilePrefetchBuffer* prefetch_buffer = nullptr); + TBlockIter* NewDataBlockIterator(const ReadOptions& ro, + CachableEntry& block, + TBlockIter* input_iter, Status s) const; class PartitionedIndexIteratorState; + template + friend class FilterBlockReaderCommon; + friend class PartitionIndexReader; + friend class UncompressionDictReader; + protected: Rep* rep_; - explicit BlockBasedTable(Rep* rep) : rep_(rep) {} + explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer) + : rep_(rep), block_cache_tracer_(block_cache_tracer) {} + // No copying allowed + explicit BlockBasedTable(const TableReader&) = delete; + void operator=(const TableReader&) = delete; private: friend class MockedBlockBasedTable; static std::atomic next_cache_key_id_; + BlockCacheTracer* const block_cache_tracer_; + + void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, + size_t usage) const; + void UpdateCacheMissMetrics(BlockType block_type, + GetContext* get_context) const; + void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage) const; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, + BlockType block_type, + GetContext* get_context) const; + + // Either Block::NewDataIterator() or Block::NewIndexIterator(). 
+ template + static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, + TBlockIter* input_iter, + bool block_contents_pinned); // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and @@ -263,31 +299,40 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. - static Status MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + template + Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index = false, - GetContext* get_context = nullptr); - - // For the following two functions: - // if `no_io == true`, we will not try to read filter/index from sst file - // were they not present in cache yet. - CachableEntry GetFilter( - const SliceTransform* prefix_extractor = nullptr, - FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false, - GetContext* get_context = nullptr) const; - virtual CachableEntry GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - const SliceTransform* prefix_extractor = nullptr) const; - - static CachableEntry GetUncompressionDict( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, - GetContext* get_context); + CachableEntry* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const; + + // Similar to the above, with one crucial difference: it will retrieve the + // block from the file even if there are no caches configured (assuming the + // read options allow I/O). + template + Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& ro, const BlockHandle& handle, + const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + bool for_compaction, bool use_cache) const; + + void RetrieveMultipleBlocks( + const ReadOptions& options, const MultiGetRange* batch, + const autovector* handles, + autovector* statuses, + autovector, MultiGetContext::MAX_BATCH_SIZE>* + results, + char* scratch, const UncompressionDict& uncompression_dict) const; // Get the iterator from the index reader. - // If input_iter is not set, return new Iterator - // If input_iter is set, update it and return it as Iterator + // + // If input_iter is not set, return a new Iterator. + // If input_iter is set, try to update it and return it as Iterator. + // However note that in some cases the returned iterator may be different + // from input_iter. In such case the returned iterator should be freed. // // Note: ErrorIterator with Status::Incomplete shall be returned if all the // following conditions are met: @@ -295,11 +340,10 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. 
We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIteratorBase* NewIndexIterator( - const ReadOptions& read_options, bool need_upper_bound_check = false, - IndexBlockIter* input_iter = nullptr, - CachableEntry* index_entry = nullptr, - GetContext* get_context = nullptr); + InternalIteratorBase* NewIndexIterator( + const ReadOptions& read_options, bool need_upper_bound_check, + IndexBlockIter* input_iter, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -307,14 +351,13 @@ class BlockBasedTable : public TableReader { // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's // dictionary. - static Status GetDataBlockFromCache( + template + Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Rep* rep, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, - const UncompressionDict& uncompression_dict, - size_t read_amp_bytes_per_bit, bool is_index = false, - GetContext* get_context = nullptr); + Cache* block_cache, Cache* block_cache_compressed, + const ReadOptions& read_options, CachableEntry* block, + const UncompressionDict& uncompression_dict, BlockType block_type, + GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -326,16 +369,15 @@ class BlockBasedTable : public TableReader { // PutDataBlockToCache(). After the call, the object will be invalid. // @param uncompression_dict Data for presetting the compression library's // dictionary. - static Status PutDataBlockToCache( + template + Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, - CachableEntry* block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, + CachableEntry* cached_block, + BlockContents* raw_block_contents, CompressionType raw_block_comp_type, const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index = false, Cache::Priority pri = Cache::Priority::LOW, - GetContext* get_context = nullptr); + MemoryAllocator* memory_allocator, BlockType block_type, + GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -343,69 +385,65 @@ class BlockBasedTable : public TableReader { friend class TableCache; friend class BlockBasedTableBuilder; - void ReadMeta(const Footer& footer); - - // Figure the index type, update it in rep_, and also return it. - BlockBasedTableOptions::IndexType UpdateIndexType(); - // Create a index reader based on the index type stored in the table. // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter // helps avoid re-reading meta index block if caller already created one. 
- Status CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, - InternalIterator* preloaded_meta_index_iter = nullptr, - const int level = -1); - - bool FullFilterKeyMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - const Slice& user_key, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; - - void FullFilterKeysMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor = nullptr) const; + Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* preloaded_meta_index_iter, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* index_reader); + + bool FullFilterKeyMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, const Slice& user_key, + const bool no_io, + const SliceTransform* prefix_extractor, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + + void FullFilterKeysMayMatch(const ReadOptions& read_options, + FilterBlockReader* filter, MultiGetRange* range, + const bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; static Status PrefetchTail( RandomAccessFileReader* file, uint64_t file_size, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, std::unique_ptr* prefetch_buffer); - static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* meta_block, - std::unique_ptr* iter); - static Status TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties); - static Status ReadPropertiesBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const SequenceNumber largest_seqno); - static Status ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator); - static Status ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block); - static Status PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - InternalIterator* meta_iter, BlockBasedTable* new_table, - const SliceTransform* prefix_extractor, bool prefetch_all, + Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* metaindex_block, + std::unique_ptr* iter); + Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer, + const Slice& handle_value, + TableProperties** table_properties); + Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator, + BlockCacheLookupContext* lookup_context); + Status PrefetchIndexAndFilterBlocks( + FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, bool prefetch_all, const BlockBasedTableOptions& table_options, const int level, - const bool prefetch_index_and_filter_in_cache); + BlockCacheLookupContext* lookup_context); + + static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); - Status 
VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); // Create the filter from the filter block. - virtual FilterBlockReader* ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor = nullptr) const; + std::unique_ptr CreateFilterBlockReader( + FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch, + bool pin, BlockCacheLookupContext* lookup_context); - static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size); + static void SetupCacheKeyPrefix(Rep* rep); // Generate a cache key prefix from the file static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file, @@ -413,62 +451,43 @@ class BlockBasedTable : public TableReader { static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer, size_t* size); + // Given an iterator return its offset in file. + uint64_t ApproximateOffsetOf( + const InternalIteratorBase& index_iter) const; + // Helper functions for DumpTable() Status DumpIndexBlock(WritableFile* out_file); Status DumpDataBlocks(WritableFile* out_file); void DumpKeyValue(const Slice& key, const Slice& value, WritableFile* out_file); - // No copying allowed - explicit BlockBasedTable(const TableReader&) = delete; - void operator=(const TableReader&) = delete; + // A cumulative data block file read in MultiGet lower than this size will + // use a stack buffer + static constexpr size_t kMultiGetReadStackBufSize = 8192; friend class PartitionedFilterBlockReader; friend class PartitionedFilterBlockTest; + friend class DBBasicTest_MultiGetIOBufferOverrun_Test; }; -// Maitaning state of a two-level iteration on a partitioned index structure +// Maitaning state of a two-level iteration on a partitioned index structure. class BlockBasedTable::PartitionedIndexIteratorState : public TwoLevelIteratorState { public: PartitionedIndexIteratorState( - BlockBasedTable* table, - std::unordered_map>* block_map, - const bool index_key_includes_seq, const bool index_key_is_full); - InternalIteratorBase* NewSecondaryIterator( + const BlockBasedTable* table, + std::unordered_map>* block_map); + InternalIteratorBase* NewSecondaryIterator( const BlockHandle& index_value) override; private: // Don't own table_ - BlockBasedTable* table_; + const BlockBasedTable* table_; std::unordered_map>* block_map_; - bool index_key_includes_seq_; - bool index_key_is_full_; -}; - -// CachableEntry represents the entries that *may* be fetched from block cache. -// field `value` is the item we want to get. -// field `cache_handle` is the cache handle to the block cache. If the value -// was not read from cache, `cache_handle` will be nullptr. -template -struct BlockBasedTable::CachableEntry { - CachableEntry(TValue* _value, Cache::Handle* _cache_handle) - : value(_value), cache_handle(_cache_handle) {} - CachableEntry() : CachableEntry(nullptr, nullptr) {} - void Release(Cache* cache, bool force_erase = false) { - if (cache_handle) { - cache->Release(cache_handle, force_erase); - value = nullptr; - cache_handle = nullptr; - } - } - bool IsSet() const { return cache_handle != nullptr; } - - TValue* value = nullptr; - // if the entry is from the cache, cache_handle will be populated. - Cache::Handle* cache_handle = nullptr; }; +// Stores all the properties associated with a BlockBasedTable. +// These are immutable. 
struct BlockBasedTable::Rep { Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, @@ -501,21 +520,14 @@ struct BlockBasedTable::Rep { size_t persistent_cache_key_prefix_size = 0; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size = 0; - uint64_t dummy_index_reader_offset = - 0; // ID that is unique for the block cache. PersistentCacheOptions persistent_cache_options; // Footer contains the fixed table information Footer footer; - // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e., - // non-nullptr) and used only when options.block_cache is nullptr or when - // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index, - // filter, and compression dictionary blocks via the block cache. In that case - // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle` - // are used to lookup these meta-blocks in block cache. + std::unique_ptr index_reader; std::unique_ptr filter; - std::unique_ptr uncompression_dict; + std::unique_ptr uncompression_dict_reader; enum class FilterType { kNoFilter, @@ -539,14 +551,6 @@ struct BlockBasedTable::Rep { std::unique_ptr internal_prefix_transform; std::shared_ptr table_prefix_extractor; - // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is - // true or in all levels when pin_top_level_index_and_filter is set in - // combination with partitioned index/filters: then we do use the LRU cache, - // but we always keep the filter & index block's handle checked out here (=we - // don't call Release()), plus the parsed out objects the LRU cache will never - // push flush them out, hence they're pinned - CachableEntry filter_entry; - CachableEntry index_entry; std::shared_ptr fragmented_range_dels; // If global_seqno is used, all Keys in this file will have the same @@ -570,26 +574,52 @@ struct BlockBasedTable::Rep { // still work, just not as quickly. bool blocks_definitely_zstd_compressed = false; - bool closed = false; + // These describe how index is encoded. + bool index_has_first_key = false; + bool index_key_includes_seq = true; + bool index_value_is_full = true; + const bool immortal_table; - SequenceNumber get_global_seqno(bool is_index) const { - return is_index ? kDisableGlobalSequenceNumber : global_seqno; + SequenceNumber get_global_seqno(BlockType block_type) const { + return (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary) + ? kDisableGlobalSequenceNumber + : global_seqno; + } + + uint64_t cf_id_for_tracing() const { + return table_properties ? table_properties->column_family_id + : rocksdb::TablePropertiesCollectorFactory:: + Context::kUnknownColumnFamily; + } + + Slice cf_name_for_tracing() const { + return table_properties ? table_properties->column_family_name + : BlockCacheTraceHelper::kUnknownColumnFamilyName; + } + + uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; } + + uint64_t sst_number_for_tracing() const { + return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; } }; +// Iterates over the contents of BlockBasedTable. 
template class BlockBasedTableIterator : public InternalIteratorBase { + // compaction_readahead_size: its value will only be used if for_compaction = + // true public: - BlockBasedTableIterator(BlockBasedTable* table, + BlockBasedTableIterator(const BlockBasedTable* table, const ReadOptions& read_options, const InternalKeyComparator& icomp, - InternalIteratorBase* index_iter, + InternalIteratorBase* index_iter, bool check_filter, bool need_upper_bound_check, - const SliceTransform* prefix_extractor, bool is_index, - bool key_includes_seq = true, - bool index_key_is_full = true, - bool for_compaction = false) + const SliceTransform* prefix_extractor, + BlockType block_type, TableReaderCaller caller, + size_t compaction_readahead_size = 0) : table_(table), read_options_(read_options), icomp_(icomp), @@ -600,10 +630,9 @@ class BlockBasedTableIterator : public InternalIteratorBase { check_filter_(check_filter), need_upper_bound_check_(need_upper_bound_check), prefix_extractor_(prefix_extractor), - is_index_(is_index), - key_includes_seq_(key_includes_seq), - index_key_is_full_(index_key_is_full), - for_compaction_(for_compaction) {} + block_type_(block_type), + lookup_context_(caller), + compaction_readahead_size_(compaction_readahead_size) {} ~BlockBasedTableIterator() { delete index_iter_; } @@ -612,22 +641,41 @@ class BlockBasedTableIterator : public InternalIteratorBase { void SeekToFirst() override; void SeekToLast() override; void Next() final override; - bool NextAndGetResult(Slice* ret_key) override; + bool NextAndGetResult(IterateResult* result) override; void Prev() override; bool Valid() const override { - return !is_out_of_bound_ && block_iter_points_to_real_block_ && - block_iter_.Valid(); + return !is_out_of_bound_ && + (is_at_first_key_from_index_ || + (block_iter_points_to_real_block_ && block_iter_.Valid())); } Slice key() const override { assert(Valid()); - return block_iter_.key(); + if (is_at_first_key_from_index_) { + return index_iter_->value().first_internal_key; + } else { + return block_iter_.key(); + } } Slice user_key() const override { assert(Valid()); - return block_iter_.user_key(); + if (is_at_first_key_from_index_) { + return ExtractUserKey(index_iter_->value().first_internal_key); + } else { + return block_iter_.user_key(); + } } TValue value() const override { assert(Valid()); + + // Load current block if not loaded. + if (is_at_first_key_from_index_ && + !const_cast(this) + ->MaterializeCurrentBlock()) { + // Oops, index is not consistent with block contents, but we have + // no good way to report error at this point. Let's return empty value. + return TValue(); + } + return block_iter_.value(); } Status status() const override { @@ -643,14 +691,26 @@ class BlockBasedTableIterator : public InternalIteratorBase { // Whether iterator invalidated for being out of bound. bool IsOutOfBound() override { return is_out_of_bound_; } + inline bool MayBeOutOfUpperBound() override { + assert(Valid()); + return !data_block_within_upper_bound_; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } bool IsKeyPinned() const override { + // Our key comes either from block_iter_'s current key + // or index_iter_'s current *value*. 
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - block_iter_points_to_real_block_ && block_iter_.IsKeyPinned(); + ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) || + (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned())); } bool IsValuePinned() const override { + // Load current block if not loaded. + if (is_at_first_key_from_index_) { + const_cast(this)->MaterializeCurrentBlock(); + } // BlockIter::IsValuePinned() is always true. No need to check return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && block_iter_points_to_real_block_; @@ -659,7 +719,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool CheckPrefixMayMatch(const Slice& ikey) { if (check_filter_ && !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, - need_upper_bound_check_)) { + need_upper_bound_check_, &lookup_context_)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to @@ -684,49 +744,60 @@ class BlockBasedTableIterator : public InternalIteratorBase { if (block_iter_points_to_real_block_) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. - prev_index_value_ = index_iter_->value(); + prev_block_offset_ = index_iter_->value().handle.offset(); } } - void InitDataBlock(); - inline void FindKeyForward(); - void FindBlockForward(); - void FindKeyBackward(); - void CheckOutOfBound(); - private: - BlockBasedTable* table_; + const BlockBasedTable* table_; const ReadOptions read_options_; const InternalKeyComparator& icomp_; UserComparatorWrapper user_comparator_; - InternalIteratorBase* index_iter_; + InternalIteratorBase* index_iter_; PinnedIteratorsManager* pinned_iters_mgr_; TBlockIter block_iter_; + + // True if block_iter_ is initialized and points to the same block + // as index iterator. bool block_iter_points_to_real_block_; + // See InternalIteratorBase::IsOutOfBound(). bool is_out_of_bound_ = false; + // Whether current data block being fully within iterate upper bound. + bool data_block_within_upper_bound_ = false; + // True if we're standing at the first key of a block, and we haven't loaded + // that block yet. A call to value() will trigger loading the block. + bool is_at_first_key_from_index_ = false; bool check_filter_; // TODO(Zhongyi): pick a better name bool need_upper_bound_check_; const SliceTransform* prefix_extractor_; - // If the blocks over which we iterate are index blocks - bool is_index_; - // If the keys in the blocks over which we iterate include 8 byte sequence - bool key_includes_seq_; - bool index_key_is_full_; - // If this iterator is created for compaction - bool for_compaction_; - BlockHandle prev_index_value_; - - // All the below fields control iterator readahead - static const size_t kInitAutoReadaheadSize = 8 * 1024; - // Found that 256 KB readahead size provides the best performance, based on - // experiments, for auto readahead. Experiment data is in PR #3282. - static const size_t kMaxAutoReadaheadSize; - static const int kMinNumFileReadsToStartAutoReadahead = 2; - size_t readahead_size_ = kInitAutoReadaheadSize; + BlockType block_type_; + uint64_t prev_block_offset_ = std::numeric_limits::max(); + BlockCacheLookupContext lookup_context_; + // Readahead size used in compaction, its value is used only if + // lookup_context_.caller = kCompaction. 
+ size_t compaction_readahead_size_; + + size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; std::unique_ptr prefetch_buffer_; + + // If `target` is null, seek to first. + void SeekImpl(const Slice* target); + + void InitDataBlock(); + bool MaterializeCurrentBlock(); + void FindKeyForward(); + void FindBlockForward(); + void FindKeyBackward(); + void CheckOutOfBound(); + + // Check if data block is fully within iterate_upper_bound. + // + // Note MyRocks may update iterate bounds between seek. To workaround it, + // we need to check and update data_block_within_upper_bound_ accordingly. + void CheckDataBlockWithinUpperBound(); }; } // namespace rocksdb diff --git a/table/block_builder.cc b/table/block_based/block_builder.cc similarity index 98% rename from table/block_builder.cc rename to table/block_based/block_builder.cc index c14b4f6d3ee..a6a240c8e0a 100644 --- a/table/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -31,13 +31,13 @@ // num_restarts: uint32 // restarts[i] contains the offset within the block of the ith restart point. -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include #include #include "db/dbformat.h" #include "rocksdb/comparator.h" -#include "table/data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "util/coding.h" namespace rocksdb { diff --git a/table/block_builder.h b/table/block_based/block_builder.h similarity index 98% rename from table/block_builder.h rename to table/block_based/block_builder.h index 0576279f501..153e57569a2 100644 --- a/table/block_builder.h +++ b/table/block_based/block_builder.h @@ -13,7 +13,7 @@ #include #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" namespace rocksdb { diff --git a/table/block_prefix_index.cc b/table/block_based/block_prefix_index.cc similarity index 99% rename from table/block_prefix_index.cc rename to table/block_based/block_prefix_index.cc index 67c749d4c3a..6e24f17cf68 100644 --- a/table/block_prefix_index.cc +++ b/table/block_based/block_prefix_index.cc @@ -3,14 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "table/block_prefix_index.h" +#include "table/block_based/block_prefix_index.h" #include +#include "memory/arena.h" #include "rocksdb/comparator.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "util/arena.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/block_prefix_index.h b/table/block_based/block_prefix_index.h similarity index 100% rename from table/block_prefix_index.h rename to table/block_based/block_prefix_index.h diff --git a/table/block_test.cc b/table/block_based/block_test.cc similarity index 78% rename from table/block_test.cc rename to table/block_based/block_test.cc index 3e0ff3eab59..38fa55089f1 100644 --- a/table/block_test.cc +++ b/table/block_based/block_test.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// + #include #include #include @@ -19,12 +20,12 @@ #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block.h" -#include "table/block_builder.h" +#include "table/block_based/block.h" +#include "table/block_based/block_builder.h" #include "table/format.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { @@ -68,37 +69,12 @@ void GenerateRandomKVs(std::vector *keys, } } -// Same as GenerateRandomKVs but the values are BlockHandle -void GenerateRandomKBHs(std::vector *keys, - std::vector *values, const int from, - const int len, const int step = 1, - const int padding_size = 0, - const int keys_share_prefix = 1) { - Random rnd(302); - uint64_t offset = 0; - - // generate different prefix - for (int i = from; i < from + len; i += step) { - // generate keys that shares the prefix - for (int j = 0; j < keys_share_prefix; ++j) { - keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); - - uint64_t size = rnd.Uniform(1024 * 16); - BlockHandle handle(offset, size); - offset += size + kBlockTrailerSize; - values->emplace_back(handle); - } - } -} - class BlockTest : public testing::Test {}; // block test TEST_F(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -122,7 +98,7 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; InternalIterator *iter = - reader.NewIterator(options.comparator, options.comparator); + reader.NewDataIterator(options.comparator, options.comparator); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -135,8 +111,7 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = - reader.NewIterator(options.comparator, options.comparator); + iter = reader.NewDataIterator(options.comparator, options.comparator); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -151,83 +126,6 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } -TEST_F(BlockTest, ValueDeltaEncodingTest) { - Random rnd(301); - Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); - - std::vector keys; - std::vector values; - const bool kUseDeltaEncoding = true; - const bool kUseValueDeltaEncoding = true; - BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); - int num_records = 100; - - GenerateRandomKBHs(&keys, &values, 0, num_records); - // add a bunch of records to a block - BlockHandle last_encoded_handle; - for (int i = 0; i < num_records; i++) { - auto block_handle = values[i]; - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle.size()); - last_encoded_handle = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); - } - - // read serialized contents of the block - Slice rawblock = builder.Finish(); - - // create block reader - BlockContents contents; - contents.data = rawblock; - Block reader(std::move(contents), kDisableGlobalSequenceNumber); - - const bool 
kTotalOrderSeek = true; - const bool kIncludesSeq = true; - const bool kValueIsFull = !kUseValueDeltaEncoding; - IndexBlockIter *kNullIter = nullptr; - Statistics *kNullStats = nullptr; - // read contents of block sequentially - int count = 0; - InternalIteratorBase *iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { - // read kv from block - Slice k = iter->key(); - BlockHandle handle = iter->value(); - - // compare with lookaside array - ASSERT_EQ(k.ToString().compare(keys[count]), 0); - - ASSERT_EQ(values[count].offset(), handle.offset()); - ASSERT_EQ(values[count].size(), handle.size()); - } - delete iter; - - // read block contents randomly - iter = reader.NewIterator( - options.comparator, options.comparator, kNullIter, kNullStats, - kTotalOrderSeek, kIncludesSeq, kValueIsFull); - for (int i = 0; i < num_records; i++) { - // find a random key in the lookaside array - int index = rnd.Uniform(num_records); - Slice k(keys[index]); - - // search in block for this key - iter->Seek(k); - ASSERT_TRUE(iter->Valid()); - BlockHandle handle = iter->value(); - ASSERT_EQ(values[index].offset(), handle.offset()); - ASSERT_EQ(values[index].size(), handle.size()); - } - delete iter; -} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, @@ -260,8 +158,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); std::unique_ptr regular_iter( - reader2.NewIterator(BytewiseComparator(), - BytewiseComparator())); + reader2.NewDataIterator(BytewiseComparator(), BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -456,8 +353,6 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { TEST_F(BlockTest, BlockWithReadAmpBitmap) { Random rnd(301); Options options = Options(); - std::unique_ptr ic; - ic.reset(new test::PlainInternalKeyComparator(options.comparator)); std::vector keys; std::vector values; @@ -485,9 +380,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // read contents of block sequentially size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -518,9 +412,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -554,9 +447,8 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { kBytesPerBit, stats.get()); size_t read_bytes = 0; - DataBlockIter *iter = - static_cast(reader.NewIterator( - options.comparator, options.comparator, nullptr, stats.get())); + DataBlockIter *iter = reader.NewDataIterator( + options.comparator, options.comparator, nullptr, stats.get()); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); @@ -586,21 +478,147 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { TEST_F(BlockTest, 
ReadAmpBitmapPow2) { std::shared_ptr stats = rocksdb::CreateDBStatistics(); - ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1); - ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2); - ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4); - ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8); - ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16); - ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32); - - ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2); - ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4); - ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8); - ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16); - ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32); - ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32); + ASSERT_EQ(BlockReadAmpBitmap(100, 1, stats.get()).GetBytesPerBit(), 1u); + ASSERT_EQ(BlockReadAmpBitmap(100, 2, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 4, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 8, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 16, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 32, stats.get()).GetBytesPerBit(), 32u); + + ASSERT_EQ(BlockReadAmpBitmap(100, 3, stats.get()).GetBytesPerBit(), 2u); + ASSERT_EQ(BlockReadAmpBitmap(100, 7, stats.get()).GetBytesPerBit(), 4u); + ASSERT_EQ(BlockReadAmpBitmap(100, 11, stats.get()).GetBytesPerBit(), 8u); + ASSERT_EQ(BlockReadAmpBitmap(100, 17, stats.get()).GetBytesPerBit(), 16u); + ASSERT_EQ(BlockReadAmpBitmap(100, 33, stats.get()).GetBytesPerBit(), 32u); + ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u); } +class IndexBlockTest + : public testing::Test, + public testing::WithParamInterface> { + public: + IndexBlockTest() = default; + + bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); } + bool includeFirstKey() const { return std::get<1>(GetParam()); } +}; + +// Similar to GenerateRandomKVs but for index block contents. +void GenerateRandomIndexEntries(std::vector *separators, + std::vector *block_handles, + std::vector *first_keys, + const int len) { + Random rnd(42); + + // For each of `len` blocks, we need to generate a first and last key. + // Let's generate n*2 random keys, sort them, group into consecutive pairs. + std::set keys; + while ((int)keys.size() < len * 2) { + // Keys need to be at least 8 bytes long to look like internal keys. 
+ keys.insert(test::RandomKey(&rnd, 12)); + } + + uint64_t offset = 0; + for (auto it = keys.begin(); it != keys.end();) { + first_keys->emplace_back(*it++); + separators->emplace_back(*it++); + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + block_handles->emplace_back(handle); + } +} + +TEST_P(IndexBlockTest, IndexValueEncodingTest) { + Random rnd(301); + Options options = Options(); + + std::vector separators; + std::vector block_handles; + std::vector first_keys; + const bool kUseDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding()); + int num_records = 100; + + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + num_records); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); + if (useValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, includeFirstKey(), + &last_encoded_handle); + } + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !useValueDeltaEncoding(); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + InternalIteratorBase *iter = reader.NewIndexIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + iter->SeekToFirst(); + for (int index = 0; index < num_records; ++index) { + ASSERT_TRUE(iter->Valid()); + + Slice k = iter->key(); + IndexValue v = iter->value(); + + EXPECT_EQ(separators[index], k.ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", + v.first_internal_key.ToString()); + + iter->Next(); + } + delete iter; + + // read block contents randomly + iter = reader.NewIndexIterator(options.comparator, options.comparator, + kNullIter, kNullStats, kTotalOrderSeek, + includeFirstKey(), kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records * 2; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(separators[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + IndexValue v = iter->value(); + EXPECT_EQ(separators[index], iter->key().ToString()); + EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); + EXPECT_EQ(block_handles[index].size(), v.handle.size()); + EXPECT_EQ(includeFirstKey() ? 
first_keys[index] : "", + v.first_internal_key.ToString()); + } + delete iter; +} + +INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, + ::testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + } // namespace rocksdb int main(int argc, char **argv) { diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h new file mode 100644 index 00000000000..a60be2e6a70 --- /dev/null +++ b/table/block_based/block_type.h @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +namespace rocksdb { + +// Represents the types of blocks used in the block based table format. +// See https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format +// for details. + +enum class BlockType : uint8_t { + kData, + kFilter, + kProperties, + kCompressionDictionary, + kRangeDeletion, + kHashIndexPrefixes, + kHashIndexMetadata, + kMetaIndex, + kIndex, + // Note: keep kInvalid the last value when adding new enum values. + kInvalid +}; + +} // namespace rocksdb diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h new file mode 100644 index 00000000000..b4cd6ec6757 --- /dev/null +++ b/table/block_based/cachable_entry.h @@ -0,0 +1,220 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include "port/likely.h" +#include "rocksdb/cache.h" +#include "rocksdb/cleanable.h" + +namespace rocksdb { + +// CachableEntry is a handle to an object that may or may not be in the block +// cache. It is used in a variety of ways: +// +// 1) It may refer to an object in the block cache. In this case, cache_ and +// cache_handle_ are not nullptr, and the cache handle has to be released when +// the CachableEntry is destroyed (the lifecycle of the cached object, on the +// other hand, is managed by the cache itself). +// 2) It may uniquely own the (non-cached) object it refers to (examples include +// a block read directly from file, or uncompressed blocks when there is a +// compressed block cache but no uncompressed block cache). In such cases, the +// object has to be destroyed when the CachableEntry is destroyed. +// 3) It may point to an object (cached or not) without owning it. In this case, +// no action is needed when the CachableEntry is destroyed. +// 4) Sometimes, management of a cached or owned object (see #1 and #2 above) +// is transferred to some other object. This is used for instance with iterators +// (where cleanup is performed using a chain of cleanup functions, +// see Cleanable). +// +// Because of #1 and #2 above, copying a CachableEntry is not safe (and thus not +// allowed); hence, this is a move-only type, where a move transfers the +// management responsibilities, and leaves the source object in an empty state. 
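To make the ownership cases above concrete, here is a minimal sketch (not from this patch, using int as a stand-in for a block type) of the owned, unowned, and moved-from states; the cached case (#1) behaves the same way except that destruction additionally releases the Cache::Handle back to the block cache:

    #include <cassert>
    #include <utility>
    #include "table/block_based/cachable_entry.h"

    void OwnershipSketch() {
      using rocksdb::CachableEntry;

      // Case 2: the entry uniquely owns a heap-allocated object; it is
      // deleted when the entry is destroyed or Reset().
      CachableEntry<int> owned;
      owned.SetOwnedValue(new int(42));
      assert(owned.GetOwnValue() && !owned.IsCached());

      // Case 3: the entry merely points at an object managed elsewhere;
      // nothing is cleaned up on destruction.
      int pinned_elsewhere = 7;
      CachableEntry<int> unowned;
      unowned.SetUnownedValue(&pinned_elsewhere);

      // Case 4: a move transfers the cleanup responsibility and leaves the
      // source empty.
      CachableEntry<int> moved = std::move(owned);
      assert(owned.IsEmpty() && moved.GetOwnValue());
    }
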
+ +template +class CachableEntry { +public: + CachableEntry() = default; + + CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle, + bool own_value) + : value_(value) + , cache_(cache) + , cache_handle_(cache_handle) + , own_value_(own_value) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + } + + CachableEntry(const CachableEntry&) = delete; + CachableEntry& operator=(const CachableEntry&) = delete; + + CachableEntry(CachableEntry&& rhs) + : value_(rhs.value_) + , cache_(rhs.cache_) + , cache_handle_(rhs.cache_handle_) + , own_value_(rhs.own_value_) + { + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + } + + CachableEntry& operator=(CachableEntry&& rhs) { + if (UNLIKELY(this == &rhs)) { + return *this; + } + + ReleaseResource(); + + value_ = rhs.value_; + cache_ = rhs.cache_; + cache_handle_ = rhs.cache_handle_; + own_value_ = rhs.own_value_; + + assert(value_ != nullptr || + (cache_ == nullptr && cache_handle_ == nullptr && !own_value_)); + assert(!!cache_ == !!cache_handle_); + assert(!cache_handle_ || !own_value_); + + rhs.ResetFields(); + + return *this; + } + + ~CachableEntry() { + ReleaseResource(); + } + + bool IsEmpty() const { + return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr && + !own_value_; + } + + bool IsCached() const { + assert(!!cache_ == !!cache_handle_); + + return cache_handle_ != nullptr; + } + + T* GetValue() const { return value_; } + Cache* GetCache() const { return cache_; } + Cache::Handle* GetCacheHandle() const { return cache_handle_; } + bool GetOwnValue() const { return own_value_; } + + void Reset() { + ReleaseResource(); + ResetFields(); + } + + void TransferTo(Cleanable* cleanable) { + if (cleanable) { + if (cache_handle_ != nullptr) { + assert(cache_ != nullptr); + cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, cache_handle_); + } else if (own_value_) { + cleanable->RegisterCleanup(&DeleteValue, value_, nullptr); + } + } + + ResetFields(); + } + + void SetOwnedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && own_value_)) { + assert(cache_ == nullptr && cache_handle_ == nullptr); + return; + } + + Reset(); + + value_ = value; + own_value_ = true; + } + + void SetUnownedValue(T* value) { + assert(value != nullptr); + + if (UNLIKELY(value_ == value && cache_ == nullptr && + cache_handle_ == nullptr && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + assert(!own_value_); + } + + void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { + assert(value != nullptr); + assert(cache != nullptr); + assert(cache_handle != nullptr); + + if (UNLIKELY(value_ == value && cache_ == cache && + cache_handle_ == cache_handle && !own_value_)) { + return; + } + + Reset(); + + value_ = value; + cache_ = cache; + cache_handle_ = cache_handle; + assert(!own_value_); + } + +private: + void ReleaseResource() { + if (LIKELY(cache_handle_ != nullptr)) { + assert(cache_ != nullptr); + cache_->Release(cache_handle_); + } else if (own_value_) { + delete value_; + } + } + + void ResetFields() { + value_ = nullptr; + cache_ = nullptr; + cache_handle_ = nullptr; + own_value_ = false; + } + + static void ReleaseCacheHandle(void* arg1, void* arg2) { + Cache* const cache = static_cast(arg1); + 
assert(cache); + + Cache::Handle* const cache_handle = static_cast(arg2); + assert(cache_handle); + + cache->Release(cache_handle); + } + + static void DeleteValue(void* arg1, void* /* arg2 */) { + delete static_cast(arg1); + } + +private: + T* value_ = nullptr; + Cache* cache_ = nullptr; + Cache::Handle* cache_handle_ = nullptr; + bool own_value_ = false; +}; + +} // namespace rocksdb diff --git a/table/data_block_footer.cc b/table/block_based/data_block_footer.cc similarity index 97% rename from table/data_block_footer.cc rename to table/block_based/data_block_footer.cc index cb9e1438152..2cf31b4c5ef 100644 --- a/table/data_block_footer.cc +++ b/table/block_based/data_block_footer.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "data_block_footer.h" +#include "table/block_based/data_block_footer.h" #include "rocksdb/table.h" diff --git a/table/data_block_footer.h b/table/block_based/data_block_footer.h similarity index 100% rename from table/data_block_footer.h rename to table/block_based/data_block_footer.h diff --git a/table/data_block_hash_index.cc b/table/block_based/data_block_hash_index.cc similarity index 98% rename from table/data_block_hash_index.cc rename to table/block_based/data_block_hash_index.cc index adb1d7b8c26..7737a9491ee 100644 --- a/table/data_block_hash_index.cc +++ b/table/block_based/data_block_hash_index.cc @@ -6,7 +6,7 @@ #include #include "rocksdb/slice.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/data_block_hash_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/data_block_hash_index.h b/table/block_based/data_block_hash_index.h similarity index 100% rename from table/data_block_hash_index.h rename to table/block_based/data_block_hash_index.h diff --git a/table/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc similarity index 97% rename from table/data_block_hash_index_test.cc rename to table/block_based/data_block_hash_index_test.cc index 11226648ef2..ae23f6ef2d3 100644 --- a/table/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -9,14 +9,14 @@ #include "db/table_properties_collector.h" #include "rocksdb/slice.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/data_block_hash_index.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/data_block_hash_index.h" #include "table/get_context.h" #include "table/table_builder.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { @@ -391,7 +391,7 @@ TEST(DataBlockHashIndex, BlockTestSingleKey) { Block reader(std::move(contents), kDisableGlobalSequenceNumber); const InternalKeyComparator icmp(BytewiseComparator()); - auto iter = reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); bool may_exist; // search in block for the key just inserted { @@ -474,8 +474,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // random seek existent keys for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a 
random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "1" /* existing key marker */); @@ -512,8 +511,7 @@ TEST(DataBlockHashIndex, BlockTestLarge) { // C true false for (int i = 0; i < num_records; i++) { - auto iter = - reader.NewIterator(&icmp, icmp.user_comparator()); + auto iter = reader.NewDataIterator(&icmp, icmp.user_comparator()); // find a random key in the lookaside array int index = rnd.Uniform(num_records); std::string ukey(keys[index] + "0" /* non-existing key marker */); @@ -633,7 +631,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -658,7 +656,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 60, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -683,7 +681,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 120, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -708,7 +706,7 @@ TEST(DataBlockHashIndex, BlockBoundary) { InternalKey seek_ikey(seek_ukey, 5, kTypeValue); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, seek_ukey, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); ASSERT_EQ(get_context.State(), GetContext::kNotFound); diff --git a/table/filter_block.h b/table/block_based/filter_block.h similarity index 67% rename from table/filter_block.h rename to table/block_based/filter_block.h index 8abb88e5f4f..90766b1140b 100644 --- a/table/filter_block.h +++ b/table/block_based/filter_block.h @@ -24,12 +24,13 @@ #include #include #include "db/dbformat.h" -#include "format.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "table/format.h" #include "table/multiget_context.h" +#include "trace_replay/block_cache_tracer.h" #include "util/hash.h" namespace rocksdb { @@ -37,6 +38,7 @@ namespace rocksdb { const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; +class GetContext; using MultiGetRange = MultiGetContext::Range; // A FilterBlockBuilder is used to construct all of the filters for a @@ -50,6 +52,10 @@ using MultiGetRange = MultiGetContext::Range; class FilterBlockBuilder { public: explicit FilterBlockBuilder() {} + // No copying allowed + FilterBlockBuilder(const FilterBlockBuilder&) = delete; + void operator=(const FilterBlockBuilder&) = delete; + virtual ~FilterBlockBuilder() {} virtual bool IsBlockBased() = 0; // If is blockbased filter @@ -64,11 +70,6 @@ class FilterBlockBuilder { return ret; } virtual Slice Finish(const BlockHandle& tmp, Status* status) = 0; 
- - private: - // No copying allowed - FilterBlockBuilder(const FilterBlockBuilder&); - void operator=(const FilterBlockBuilder&); }; // A FilterBlockReader is used to parse filter from SST table. @@ -77,16 +78,14 @@ class FilterBlockBuilder { // BlockBased/Full FilterBlock would be called in the same way. class FilterBlockReader { public: - explicit FilterBlockReader() - : whole_key_filtering_(true), size_(0), statistics_(nullptr) {} - explicit FilterBlockReader(size_t s, Statistics* stats, - bool _whole_key_filtering) - : whole_key_filtering_(_whole_key_filtering), - size_(s), - statistics_(stats) {} - virtual ~FilterBlockReader() {} + FilterBlockReader() = default; + virtual ~FilterBlockReader() = default; + + FilterBlockReader(const FilterBlockReader&) = delete; + FilterBlockReader& operator=(const FilterBlockReader&) = delete; virtual bool IsBlockBased() = 0; // If is blockbased filter + /** * If no_io is set, then it returns true if it cannot answer the query without * reading data from disk. This is used in PartitionedFilterBlockReader to @@ -99,18 +98,21 @@ class FilterBlockReader { */ virtual bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void KeysMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey)) { + GetContext* const get_context = iter->get_context; + if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -121,29 +123,29 @@ class FilterBlockReader { */ virtual bool PrefixMayMatch(const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) = 0; + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey = iter->ukey; const Slice ikey = iter->ikey; - if (!KeyMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey)) { + GetContext* const get_context = iter->get_context; + if (prefix_extractor->InDomain(ukey) && + !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, + block_offset, no_io, &ikey, get_context, + lookup_context)) { range->SkipKey(iter); } } } virtual size_t ApproximateMemoryUsage() const = 0; - virtual size_t size() const { return size_; } - virtual Statistics* statistics() const { return statistics_; } - - bool whole_key_filtering() const { return whole_key_filtering_; } // convert this object to a human readable form virtual std::string 
ToString() const { @@ -151,30 +153,22 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(bool /*pin*/, - const SliceTransform* /*prefix_extractor*/) {} + virtual void CacheDependencies(bool /*pin*/) {} - virtual bool RangeMayExist( - const Slice* /*iterate_upper_bound*/, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, - bool* filter_checked, bool /*need_upper_bound_check*/) { + virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, + const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, + const Slice* const const_ikey_ptr, + bool* filter_checked, + bool /*need_upper_bound_check*/, + BlockCacheLookupContext* lookup_context) { *filter_checked = true; Slice prefix = prefix_extractor->Transform(user_key); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } - - protected: - bool whole_key_filtering_; - - private: - // No copying allowed - FilterBlockReader(const FilterBlockReader&); - void operator=(const FilterBlockReader&); - size_t size_; - Statistics* statistics_; - int level_ = -1; }; } // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc new file mode 100644 index 00000000000..49a26882305 --- /dev/null +++ b/table/block_based/filter_block_reader_common.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/filter_block_reader_common.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/parsed_full_filter_block.h" + +namespace rocksdb { + +template +Status FilterBlockReaderCommon::ReadFilterBlock( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) { + PERF_TIMER_GUARD(read_filter_block_nanos); + + assert(table); + assert(filter_block); + assert(filter_block->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + + const Status s = + table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, use_cache); + + return s; +} + +template +const SliceTransform* +FilterBlockReaderCommon::table_prefix_extractor() const { + assert(table_); + + const BlockBasedTable::Rep* const rep = table_->get_rep(); + assert(rep); + + return rep->prefix_filtering ? 
rep->table_prefix_extractor.get() : nullptr; +} + +template +bool FilterBlockReaderCommon::whole_key_filtering() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->whole_key_filtering; +} + +template +bool FilterBlockReaderCommon::cache_filter_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +template +Status FilterBlockReaderCommon::GetOrReadFilterBlock( + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(filter_block); + + if (!filter_block_.IsEmpty()) { + filter_block->SetUnownedValue(filter_block_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + cache_filter_blocks(), get_context, lookup_context, + filter_block); +} + +template +size_t FilterBlockReaderCommon::ApproximateFilterBlockMemoryUsage() + const { + assert(!filter_block_.GetOwnValue() || filter_block_.GetValue() != nullptr); + return filter_block_.GetOwnValue() + ? filter_block_.GetValue()->ApproximateMemoryUsage() + : 0; +} + +// Explicitly instantiate templates for both "blocklike" types we use. +// This makes it possible to keep the template definitions in the .cc file. +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; +template class FilterBlockReaderCommon; + +} // namespace rocksdb diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h new file mode 100644 index 00000000000..4e691e0d913 --- /dev/null +++ b/table/block_based/filter_block_reader_common.h @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/block_based/filter_block.h" + +namespace rocksdb { + +class BlockBasedTable; +class FilePrefetchBuffer; + +// Encapsulates common functionality for the various filter block reader +// implementations. Provides access to the filter block regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. 
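As a rough illustration of how this base class is meant to be used (a hypothetical skeleton, not code from this patch; the remaining pure-virtual overrides and the actual bits-matching logic are omitted, so the class stays abstract), a concrete reader parameterized on ParsedFullFilterBlock could fetch its filter block on demand like so:

    #include "table/block_based/filter_block_reader_common.h"
    #include "table/block_based/parsed_full_filter_block.h"

    class SketchFilterReader
        : public rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock> {
     public:
      using Base = rocksdb::FilterBlockReaderCommon<rocksdb::ParsedFullFilterBlock>;
      using Base::Base;  // inherit the (table, filter block) constructor

      bool KeyMayMatch(const rocksdb::Slice& /*key*/,
                       const rocksdb::SliceTransform* /*prefix_extractor*/,
                       uint64_t /*block_offset*/, const bool no_io,
                       const rocksdb::Slice* const /*const_ikey_ptr*/,
                       rocksdb::GetContext* get_context,
                       rocksdb::BlockCacheLookupContext* lookup_context) override {
        rocksdb::CachableEntry<rocksdb::ParsedFullFilterBlock> block;
        // Serves the block from the pinned member, the block cache, or the
        // file, honoring `no_io` by restricting the read tier.
        if (!GetOrReadFilterBlock(no_io, get_context, lookup_context, &block)
                 .ok()) {
          return true;  // on error, do not filter anything out
        }
        // ... consult the parsed filter in block.GetValue() here ...
        return true;
      }
      // IsBlockBased(), PrefixMayMatch(), ApproximateMemoryUsage(), etc.
      // would be overridden in a real implementation.
    };
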
+template +class FilterBlockReaderCommon : public FilterBlockReader { + public: + FilterBlockReaderCommon(const BlockBasedTable* t, + CachableEntry&& filter_block) + : table_(t), filter_block_(std::move(filter_block)) { + assert(table_); + } + + protected: + static Status ReadFilterBlock(const BlockBasedTable* table, + FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block); + + const BlockBasedTable* table() const { return table_; } + const SliceTransform* table_prefix_extractor() const; + bool whole_key_filtering() const; + bool cache_filter_blocks() const; + + Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + size_t ApproximateFilterBlockMemoryUsage() const; + + private: + const BlockBasedTable* table_; + CachableEntry filter_block_; +}; + +} // namespace rocksdb diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc new file mode 100644 index 00000000000..38a6e9f83b2 --- /dev/null +++ b/table/block_based/filter_policy.cc @@ -0,0 +1,697 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include "rocksdb/filter_policy.h" + +#include "rocksdb/slice.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/filter_policy_internal.h" +#include "third-party/folly/folly/ConstexprMath.h" +#include "util/bloom_impl.h" +#include "util/coding.h" +#include "util/hash.h" + +namespace rocksdb { + +namespace { + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { + public: + explicit FastLocalBloomBitsBuilder(const int millibits_per_key) + : millibits_per_key_(millibits_per_key), + num_probes_(FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_)) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + virtual void AddKey(const Slice& key) override { + uint64_t hash = GetSliceHash64(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + virtual Slice Finish(std::unique_ptr* buf) override { + uint32_t len_with_metadata = + CalculateSpace(static_cast(hash_entries_.size())); + char* data = new char[len_with_metadata]; + memset(data, 0, len_with_metadata); + + assert(data); + assert(len_with_metadata >= 5); + + uint32_t len = len_with_metadata - 5; + if (len > 0) { + AddAllEntries(data, len); + } + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + data[len] = static_cast(-1); + // 0 = Marker for this sub-implementation + data[len + 1] = static_cast(0); + // num_probes (and 0 in upper bits for 64-byte block size) + data[len + 2] = static_cast(num_probes_); 
+ // rest of metadata stays zero + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, len_with_metadata); + } + + int CalculateNumEntry(const uint32_t bytes) override { + uint32_t bytes_no_meta = bytes >= 5u ? bytes - 5u : 0; + return static_cast(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t num_cache_lines = 0; + if (millibits_per_key_ > 0 && num_entry > 0) { + num_cache_lines = static_cast( + (int64_t{num_entry} * millibits_per_key_ + 511999) / 512000); + } + return num_cache_lines * 64 + /*metadata*/ 5; + } + + private: + void AddAllEntries(char* data, uint32_t len) const { + // Simple version without prefetching: + // + // for (auto h : hash_entries_) { + // FastLocalBloomImpl::AddHash(Lower32of64(h), Upper32of64(h), len, + // num_probes_, data); + // } + + const size_t num_entries = hash_entries_.size(); + constexpr size_t kBufferMask = 7; + static_assert(((kBufferMask + 1) & kBufferMask) == 0, + "Must be power of 2 minus 1"); + + std::array hashes; + std::array byte_offsets; + + // Prime the buffer + size_t i = 0; + for (; i <= kBufferMask && i < num_entries; ++i) { + uint64_t h = hash_entries_[i]; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + + // Process and buffer + for (; i < num_entries; ++i) { + uint32_t& hash_ref = hashes[i & kBufferMask]; + uint32_t& byte_offset_ref = byte_offsets[i & kBufferMask]; + // Process (add) + FastLocalBloomImpl::AddHashPrepared(hash_ref, num_probes_, + data + byte_offset_ref); + // And buffer + uint64_t h = hash_entries_[i]; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len, data, + /*out*/ &byte_offset_ref); + hash_ref = Upper32of64(h); + } + + // Finish processing + for (i = 0; i <= kBufferMask && i < num_entries; ++i) { + FastLocalBloomImpl::AddHashPrepared(hashes[i], num_probes_, + data + byte_offsets[i]); + } + } + + int millibits_per_key_; + int num_probes_; + std::vector hash_entries_; +}; + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsReader : public FilterBitsReader { + public: + FastLocalBloomBitsReader(const char* data, int num_probes, uint32_t len_bytes) + : data_(data), num_probes_(num_probes), len_bytes_(len_bytes) {} + + // No Copy allowed + FastLocalBloomBitsReader(const FastLocalBloomBitsReader&) = delete; + void operator=(const FastLocalBloomBitsReader&) = delete; + + ~FastLocalBloomBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + uint32_t byte_offset; + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offset); + return FastLocalBloomImpl::HashMayMatchPrepared(Upper32of64(h), num_probes_, + data_ + byte_offset); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + uint64_t h = GetSliceHash64(*keys[i]); + FastLocalBloomImpl::PrepareHash(Lower32of64(h), len_bytes_, data_, + /*out*/ &byte_offsets[i]); + hashes[i] = Upper32of64(h); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = FastLocalBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i]); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t len_bytes_; +}; + +using LegacyBloomImpl = LegacyLocalityBloomImpl; + +class LegacyBloomBitsBuilder : public 
BuiltinFilterBitsBuilder { + public: + explicit LegacyBloomBitsBuilder(const int bits_per_key); + + // No Copy allowed + LegacyBloomBitsBuilder(const LegacyBloomBitsBuilder&) = delete; + void operator=(const LegacyBloomBitsBuilder&) = delete; + + ~LegacyBloomBitsBuilder() override; + + void AddKey(const Slice& key) override; + + Slice Finish(std::unique_ptr* buf) override; + + int CalculateNumEntry(const uint32_t bytes) override; + + uint32_t CalculateSpace(const int num_entry) override { + uint32_t dont_care1; + uint32_t dont_care2; + return CalculateSpace(num_entry, &dont_care1, &dont_care2); + } + + private: + int bits_per_key_; + int num_probes_; + std::vector hash_entries_; + + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Implementation-specific variant of public CalculateSpace + uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); +}; + +LegacyBloomBitsBuilder::LegacyBloomBitsBuilder(const int bits_per_key) + : bits_per_key_(bits_per_key), + num_probes_(LegacyNoLocalityBloomImpl::ChooseNumProbes(bits_per_key_)) { + assert(bits_per_key_); +} + +LegacyBloomBitsBuilder::~LegacyBloomBitsBuilder() {} + +void LegacyBloomBitsBuilder::AddKey(const Slice& key) { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } +} + +Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { + uint32_t total_bits, num_lines; + char* data = ReserveSpace(static_cast(hash_entries_.size()), &total_bits, + &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + } + // See BloomFilterPolicy::GetFilterBitsReader for metadata + data[total_bits / 8] = static_cast(num_probes_); + EncodeFixed32(data + total_bits / 8 + 1, static_cast(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); +} + +uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
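// Worked example (assuming CACHE_LINE_SIZE == 64, i.e. 512 bits per line):
// 1000 keys at 10 bits/key -> 10000 bits -> ceil(10000 / 512) = 20 lines,
// which is even, so it is bumped to 21 lines = 21 * 512 = 10752 bits.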
+ if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + assert(bits_per_key_); + if (num_entry != 0) { + uint32_t total_bits_tmp = static_cast(num_entry * bits_per_key_); + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; + } + + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + return sz; +} + +char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, + uint32_t* num_lines) { + uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + char* data = new char[sz]; + memset(data, 0, sz); + return data; +} + +int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + int high = static_cast(bytes * 8 / bits_per_key_ + 1); + int low = 1; + int n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + assert(n < high); // High should be an overestimation + return n; +} + +inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, + uint32_t total_bits) { +#ifdef NDEBUG + static_cast(total_bits); +#endif + assert(num_lines > 0 && total_bits > 0); + + LegacyBloomImpl::AddHash(h, num_lines, num_probes_, data, + folly::constexpr_log2(CACHE_LINE_SIZE)); +} + +class LegacyBloomBitsReader : public FilterBitsReader { + public: + LegacyBloomBitsReader(const char* data, int num_probes, uint32_t num_lines, + uint32_t log2_cache_line_size) + : data_(data), + num_probes_(num_probes), + num_lines_(num_lines), + log2_cache_line_size_(log2_cache_line_size) {} + + // No Copy allowed + LegacyBloomBitsReader(const LegacyBloomBitsReader&) = delete; + void operator=(const LegacyBloomBitsReader&) = delete; + + ~LegacyBloomBitsReader() override {} + + // "contents" contains the data built by a preceding call to + // FilterBitsBuilder::Finish. MayMatch must return true if the key was + // passed to FilterBitsBuilder::AddKey. This method may return true or false + // if the key was not on the list, but it should aim to return false with a + // high probability. 
+ bool MayMatch(const Slice& key) override { + uint32_t hash = BloomHash(key); + uint32_t byte_offset; + LegacyBloomImpl::PrepareHashMayMatch( + hash, num_lines_, data_, /*out*/ &byte_offset, log2_cache_line_size_); + return LegacyBloomImpl::HashMayMatchPrepared( + hash, num_probes_, data_ + byte_offset, log2_cache_line_size_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHash(*keys[i]); + LegacyBloomImpl::PrepareHashMayMatch(hashes[i], num_lines_, data_, + /*out*/ &byte_offsets[i], + log2_cache_line_size_); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = LegacyBloomImpl::HashMayMatchPrepared( + hashes[i], num_probes_, data_ + byte_offsets[i], + log2_cache_line_size_); + } + } + + private: + const char* data_; + const int num_probes_; + const uint32_t num_lines_; + const uint32_t log2_cache_line_size_; +}; + +class AlwaysTrueFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return true; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +class AlwaysFalseFilter : public FilterBitsReader { + public: + bool MayMatch(const Slice&) override { return false; } + using FilterBitsReader::MayMatch; // inherit overload +}; + +} // namespace + +const std::vector BloomFilterPolicy::kAllFixedImpls = { + kLegacyBloom, + kDeprecatedBlock, + kFastLocalBloom, +}; + +const std::vector BloomFilterPolicy::kAllUserModes = { + kDeprecatedBlock, + kAuto, +}; + +BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) + : mode_(mode) { + // Sanitize bits_per_key + if (bits_per_key < 1.0) { + bits_per_key = 1.0; + } else if (!(bits_per_key < 100.0)) { // including NaN + bits_per_key = 100.0; + } + + // Includes a nudge toward rounding up, to ensure on all platforms + // that doubles specified with three decimal digits after the decimal + // point are interpreted accurately. + millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); + + // For better or worse, this is a rounding up of a nudged rounding up, + // e.g. 7.4999999999999 will round up to 8, but that provides more + // predictability against small arithmetic errors in floating point. + whole_bits_per_key_ = (millibits_per_key_ + 500) / 1000; +} + +BloomFilterPolicy::~BloomFilterPolicy() {} + +const char* BloomFilterPolicy::Name() const { + return "rocksdb.BuiltinBloomFilter"; +} + +void BloomFilterPolicy::CreateFilter(const Slice* keys, int n, + std::string* dst) const { + // We should ideally only be using this deprecated interface for + // appropriately constructed BloomFilterPolicy + assert(mode_ == kDeprecatedBlock); + + // Compute bloom filter size (in both bits and bytes) + uint32_t bits = static_cast(n * whole_bits_per_key_); + + // For small n, we can see a very high false positive rate. Fix it + // by enforcing a minimum bloom filter length. 
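// e.g. with whole_bits_per_key_ == 10 and n == 1 the computed size would be
// only 10 bits, so clamp to a 64-bit (8-byte) minimum payload instead.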
+ if (bits < 64) bits = 64; + + uint32_t bytes = (bits + 7) / 8; + bits = bytes * 8; + + int num_probes = + LegacyNoLocalityBloomImpl::ChooseNumProbes(whole_bits_per_key_); + + const size_t init_size = dst->size(); + dst->resize(init_size + bytes, 0); + dst->push_back(static_cast(num_probes)); // Remember # of probes + char* array = &(*dst)[init_size]; + for (int i = 0; i < n; i++) { + LegacyNoLocalityBloomImpl::AddHash(BloomHash(keys[i]), bits, num_probes, + array); + } +} + +bool BloomFilterPolicy::KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const { + const size_t len = bloom_filter.size(); + if (len < 2 || len > 0xffffffffU) { + return false; + } + + const char* array = bloom_filter.data(); + const uint32_t bits = static_cast(len - 1) * 8; + + // Use the encoded k so that we can read filters generated by + // bloom filters created using different parameters. + const int k = static_cast(array[len - 1]); + if (k > 30) { + // Reserved for potentially new encodings for short bloom filters. + // Consider it a match. + return true; + } + // NB: using stored k not num_probes for whole_bits_per_key_ + return LegacyNoLocalityBloomImpl::HashMayMatch(BloomHash(key), bits, k, + array); +} + +FilterBitsBuilder* BloomFilterPolicy::GetFilterBitsBuilder() const { + // This code path should no longer be used, for the built-in + // BloomFilterPolicy. Internal to RocksDB and outside + // BloomFilterPolicy, only get a FilterBitsBuilder with + // BloomFilterPolicy::GetBuilderFromContext(), which will call + // BloomFilterPolicy::GetBuilderWithContext(). RocksDB users have + // been warned (HISTORY.md) that they can no longer call this on + // the built-in BloomFilterPolicy (unlikely). + assert(false); + return GetBuilderWithContext(FilterBuildingContext(BlockBasedTableOptions())); +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( + const FilterBuildingContext& context) const { + Mode cur = mode_; + // Unusual code construction so that we can have just + // one exhaustive switch without (risky) recursion + for (int i = 0; i < 2; ++i) { + switch (cur) { + case kAuto: + if (context.table_options.format_version < 5) { + cur = kLegacyBloom; + } else { + cur = kFastLocalBloom; + } + break; + case kDeprecatedBlock: + return nullptr; + case kFastLocalBloom: + return new FastLocalBloomBitsBuilder(millibits_per_key_); + case kLegacyBloom: + return new LegacyBloomBitsBuilder(whole_bits_per_key_); + } + } + assert(false); + return nullptr; // something legal +} + +FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( + const FilterBuildingContext& context) { + if (context.table_options.filter_policy) { + return context.table_options.filter_policy->GetBuilderWithContext(context); + } else { + return nullptr; + } +} + +// Read metadata to determine what kind of FilterBitsReader is needed +// and return a new one. +FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + if (len_with_meta <= 5) { + // filter is empty or broken. Treat like zero keys added. + return new AlwaysFalseFilter(); + } + + // Legacy Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... 
| + // len +-----------------------------------+ + // | byte for num_probes or | + // | marker for new implementations | + // len+1 +-----------------------------------+ + // | four bytes for number of cache | + // | lines | + // len_with_meta +-----------------------------------+ + + int8_t raw_num_probes = + static_cast(contents.data()[len_with_meta - 5]); + // NB: *num_probes > 30 and < 128 probably have not been used, because of + // BloomFilterPolicy::initialize, unless directly calling + // LegacyBloomBitsBuilder as an API, but we are leaving those cases in + // limbo with LegacyBloomBitsReader for now. + + if (raw_num_probes < 1) { + // Note: < 0 (or unsigned > 127) indicate special new implementations + // (or reserved for future use) + if (raw_num_probes == -1) { + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + } + // otherwise + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + // else attempt decode for LegacyBloomBitsReader + + int num_probes = raw_num_probes; + assert(num_probes >= 1); + assert(num_probes <= 127); + + uint32_t len = len_with_meta - 5; + assert(len > 0); + + uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); + uint32_t log2_cache_line_size; + + if (num_lines * CACHE_LINE_SIZE == len) { + // Common case + log2_cache_line_size = folly::constexpr_log2(CACHE_LINE_SIZE); + } else if (num_lines == 0 || len % num_lines != 0) { + // Invalid (no solution to num_lines * x == len) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } else { + // Determine the non-native cache line size (from another system) + log2_cache_line_size = 0; + while ((num_lines << log2_cache_line_size) < len) { + ++log2_cache_line_size; + } + if ((num_lines << log2_cache_line_size) != len) { + // Invalid (block size not a power of two) + // Treat as zero probes (always FP) for now. + return new AlwaysTrueFilter(); + } + } + // if not early return + return new LegacyBloomBitsReader(contents.data(), num_probes, num_lines, + log2_cache_line_size); +} + +// For newer Bloom filter implementations +FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - 5; + + assert(len > 0); // precondition + + // New Bloom filter data: + // 0 +-----------------------------------+ + // | Raw Bloom filter data | + // | ... | + // len +-----------------------------------+ + // | char{-1} byte -> new Bloom filter | + // len+1 +-----------------------------------+ + // | byte for subimplementation | + // | 0: FastLocalBloom | + // | other: reserved | + // len+2 +-----------------------------------+ + // | byte for block_and_probes | + // | 0 in top 3 bits -> 6 -> 64-byte | + // | reserved: | + // | 1 in top 3 bits -> 7 -> 128-byte| + // | 2 in top 3 bits -> 8 -> 256-byte| + // | ... 
| + // | num_probes in bottom 5 bits, | + // | except 0 and 31 reserved | + // len+3 +-----------------------------------+ + // | two bytes reserved | + // | possibly for hash seed | + // len_with_meta +-----------------------------------+ + + // Read more metadata (see above) + char sub_impl_val = contents.data()[len_with_meta - 4]; + char block_and_probes = contents.data()[len_with_meta - 3]; + int log2_block_bytes = ((block_and_probes >> 5) & 7) + 6; + + int num_probes = (block_and_probes & 31); + if (num_probes < 1 || num_probes > 30) { + // Reserved / future safe + return new AlwaysTrueFilter(); + } + + uint16_t rest = DecodeFixed16(contents.data() + len_with_meta - 2); + if (rest != 0) { + // Reserved, possibly for hash seed + // Future safe + return new AlwaysTrueFilter(); + } + + if (sub_impl_val == 0) { // FastLocalBloom + if (log2_block_bytes == 6) { // Only block size supported for now + return new FastLocalBloomBitsReader(contents.data(), num_probes, len); + } + } + // otherwise + // Reserved / future safe + return new AlwaysTrueFilter(); +} + +const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, + bool use_block_based_builder) { + BloomFilterPolicy::Mode m; + if (use_block_based_builder) { + m = BloomFilterPolicy::kDeprecatedBlock; + } else { + m = BloomFilterPolicy::kAuto; + } + assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), + BloomFilterPolicy::kAllUserModes.end(), + m) != BloomFilterPolicy::kAllUserModes.end()); + return new BloomFilterPolicy(bits_per_key, m); +} + +FilterBuildingContext::FilterBuildingContext( + const BlockBasedTableOptions& _table_options) + : table_options(_table_options) {} + +FilterPolicy::~FilterPolicy() { } + +} // namespace rocksdb diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h new file mode 100644 index 00000000000..b92980a546a --- /dev/null +++ b/table/block_based/filter_policy_internal.h @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include + +#include "rocksdb/filter_policy.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +class Slice; + +// Exposes any extra information needed for testing built-in +// FilterBitsBuilders +class BuiltinFilterBitsBuilder : public FilterBitsBuilder { + public: + // Calculate number of bytes needed for a new filter, including + // metadata. Passing the result to CalculateNumEntry should + // return >= the num_entry passed in. + virtual uint32_t CalculateSpace(const int num_entry) = 0; +}; + +// RocksDB built-in filter policy for Bloom or Bloom-like filters. +// This class is considered internal API and subject to change. +// See NewBloomFilterPolicy. +class BloomFilterPolicy : public FilterPolicy { + public: + // An internal marker for operating modes of BloomFilterPolicy, in terms + // of selecting an implementation. This makes it easier for tests to track + // or to walk over the built-in set of Bloom filter implementations. 
The + // only variance in BloomFilterPolicy by mode/implementation is in + // GetFilterBitsBuilder(), so an enum is practical here vs. subclasses. + // + // This enum is essentially the union of all the different kinds of return + // value from GetFilterBitsBuilder, or "underlying implementation", and + // higher-level modes that choose an underlying implementation based on + // context information. + enum Mode { + // Legacy implementation of Bloom filter for full and partitioned filters. + // Set to 0 in case of value confusion with bool use_block_based_builder + // NOTE: TESTING ONLY as this mode does not use best compatible + // implementation + kLegacyBloom = 0, + // Deprecated block-based Bloom filter implementation. + // Set to 1 in case of value confusion with bool use_block_based_builder + // NOTE: DEPRECATED but user exposed + kDeprecatedBlock = 1, + // A fast, cache-local Bloom filter implementation. See description in + // FastLocalBloomImpl. + // NOTE: TESTING ONLY as this mode does not check format_version + kFastLocalBloom = 2, + // Automatically choose from the above (except kDeprecatedBlock) based on + // context at build time, including compatibility with format_version. + // NOTE: This is currently the only recommended mode that is user exposed. + kAuto = 100, + }; + // All the different underlying implementations that a BloomFilterPolicy + // might use, as a mode that says "always use this implementation." + // Only appropriate for unit tests. + static const std::vector kAllFixedImpls; + + // All the different modes of BloomFilterPolicy that are exposed from + // user APIs. Only appropriate for higher-level unit tests. Integration + // tests should prefer using NewBloomFilterPolicy (user-exposed). + static const std::vector kAllUserModes; + + explicit BloomFilterPolicy(double bits_per_key, Mode mode); + + ~BloomFilterPolicy() override; + + const char* Name() const override; + + // Deprecated block-based filter only + void CreateFilter(const Slice* keys, int n, std::string* dst) const override; + + // Deprecated block-based filter only + bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const override; + + FilterBitsBuilder* GetFilterBitsBuilder() const override; + + // To use this function, call GetBuilderFromContext(). + // + // Neither the context nor any objects therein should be saved beyond + // the call to this function, unless it's shared_ptr. + FilterBitsBuilder* GetBuilderWithContext( + const FilterBuildingContext&) const override; + + // Returns a new FilterBitsBuilder from the filter_policy in + // table_options of a context, or nullptr if not applicable. + // (An internal convenience function to save boilerplate.) + static FilterBitsBuilder* GetBuilderFromContext(const FilterBuildingContext&); + + // Read metadata to determine what kind of FilterBitsReader is needed + // and return a new one. This must successfully process any filter data + // generated by a built-in FilterBitsBuilder, regardless of the impl + // chosen for this BloomFilterPolicy. Not compatible with CreateFilter. + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override; + + // Essentially for testing only: configured millibits/key + int GetMillibitsPerKey() const { return millibits_per_key_; } + // Essentially for testing only: legacy whole bits/key + int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + + private: + // Newer filters support fractional bits per key. 
For predictable behavior + // of 0.001-precision values across floating point implementations, we + // round to thousandths of a bit (on average) per key. + int millibits_per_key_; + + // Older filters round to whole number bits per key. (There *should* be no + // compatibility issue with fractional bits per key, but preserving old + // behavior with format_version < 5 just in case.) + int whole_bits_per_key_; + + // Selected mode (a specific implementation or way of selecting an + // implementation) for building new SST filters. + Mode mode_; + + // For newer Bloom filter implementation(s) + FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; +}; + +} // namespace rocksdb diff --git a/table/flush_block_policy.cc b/table/block_based/flush_block_policy.cc similarity index 98% rename from table/flush_block_policy.cc rename to table/block_based/flush_block_policy.cc index 1b1675828da..31576848c07 100644 --- a/table/flush_block_policy.cc +++ b/table/block_based/flush_block_policy.cc @@ -6,7 +6,7 @@ #include "rocksdb/flush_block_policy.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "table/block_builder.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include diff --git a/table/flush_block_policy.h b/table/block_based/flush_block_policy.h similarity index 100% rename from table/flush_block_policy.h rename to table/block_based/flush_block_policy.h diff --git a/table/full_filter_block.cc b/table/block_based/full_filter_block.cc similarity index 52% rename from table/full_filter_block.cc rename to table/block_based/full_filter_block.cc index 9015e96d2ea..b3b2f58136b 100644 --- a/table/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -3,27 +3,22 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
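As a rough illustration of how the user-facing knobs above map onto an implementation, the sketch below distills the selection logic from NewBloomFilterPolicy and BloomFilterPolicy::GetBuilderWithContext shown earlier: use_block_based_builder pins the deprecated block-based filter, while the automatic mode resolves to the legacy full-filter builder for format_version < 5 and to the newer cache-local builder otherwise. The enum and helper here are standalone stand-ins for illustration, not actual RocksDB symbols.

#include <cstdint>
#include <iostream>

// Stand-in for the subset of BloomFilterPolicy::Mode reachable via user APIs.
enum class Impl { kDeprecatedBlock, kLegacyBloom, kFastLocalBloom };

// Mirrors the decision made by NewBloomFilterPolicy + GetBuilderWithContext:
// the block-based builder is a fixed (deprecated) choice; otherwise "auto"
// picks the legacy builder for format_version < 5 and the fast cache-local
// builder for format_version >= 5.
Impl ChooseImpl(bool use_block_based_builder, uint32_t format_version) {
  if (use_block_based_builder) {
    return Impl::kDeprecatedBlock;
  }
  return format_version < 5 ? Impl::kLegacyBloom : Impl::kFastLocalBloom;
}

int main() {
  std::cout << (ChooseImpl(false, 4) == Impl::kLegacyBloom) << "\n";     // 1
  std::cout << (ChooseImpl(false, 5) == Impl::kFastLocalBloom) << "\n";  // 1
  std::cout << (ChooseImpl(true, 5) == Impl::kDeprecatedBlock) << "\n";  // 1
  return 0;
}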
-#include "table/full_filter_block.h" - -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif +#include "table/block_based/full_filter_block.h" +#include #include "monitoring/perf_context_imp.h" +#include "port/malloc.h" #include "port/port.h" #include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" #include "util/coding.h" namespace rocksdb { FullFilterBlockBuilder::FullFilterBlockBuilder( - const SliceTransform* prefix_extractor, bool whole_key_filtering, + const SliceTransform* _prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder) - : prefix_extractor_(prefix_extractor), + : prefix_extractor_(_prefix_extractor), whole_key_filtering_(whole_key_filtering), last_whole_key_recorded_(false), last_prefix_recorded_(false), @@ -62,7 +57,7 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) { } // Add prefix to filter if needed -inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { +void FullFilterBlockBuilder::AddPrefix(const Slice& key) { Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole @@ -98,57 +93,87 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, } FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - const Slice& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FilterBlockReader(contents.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - contents_(contents) { - assert(filter_bits_reader != nullptr); - filter_bits_reader_.reset(filter_bits_reader); - if (prefix_extractor_ != nullptr) { + const BlockBasedTable* t, + CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (prefix_extractor) { full_length_enabled_ = - prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + prefix_extractor->FullLengthEnabled(&prefix_extractor_full_length_); } } -FullFilterBlockReader::FullFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats) - : FullFilterBlockReader(prefix_extractor, _whole_key_filtering, - contents.data, filter_bits_reader, stats) { - block_contents_ = std::move(contents); -} - bool FullFilterBlockReader::KeyMayMatch( const Slice& key, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { return true; } - return MayMatch(key); + return MayMatch(key, no_io, get_context, lookup_context); +} + +std::unique_ptr FullFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, 
nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new FullFilterBlockReader(table, std::move(filter_block))); } bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/, - const Slice* const /*const_ikey_ptr*/) { + uint64_t block_offset, const bool no_io, + const Slice* const /*const_ikey_ptr*/, GetContext* get_context, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)block_offset; #endif assert(block_offset == kNotValid); - return MayMatch(prefix); + return MayMatch(prefix, no_io, get_context, lookup_context); } -bool FullFilterBlockReader::MayMatch(const Slice& entry) { - if (contents_.size() != 0) { - if (filter_bits_reader_->MayMatch(entry)) { +bool FullFilterBlockReader::MayMatch( + const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (!s.ok()) { + return true; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (filter_bits_reader) { + if (filter_bits_reader->MayMatch(entry)) { PERF_COUNTER_ADD(bloom_sst_hit_count, 1); return true; } else { @@ -161,33 +186,50 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, - uint64_t block_offset, const bool /*no_io*/) { + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - if (!whole_key_filtering_) { + if (!whole_key_filtering()) { // Simply return. Don't skip any key - consider all keys as likely to be // present return; } - MayMatch(range); + MayMatch(range, no_io, nullptr, lookup_context); } void FullFilterBlockReader::PrefixesMayMatch( - MultiGetRange* range, const SliceTransform* /* prefix_extractor */, - uint64_t block_offset, const bool /*no_io*/) { + MultiGetRange* range, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) { #ifdef NDEBUG (void)range; (void)block_offset; #endif assert(block_offset == kNotValid); - MayMatch(range); + MayMatch(range, no_io, prefix_extractor, lookup_context); } -void FullFilterBlockReader::MayMatch(MultiGetRange* range) { - if (contents_.size() == 0) { +void FullFilterBlockReader::MayMatch( + MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const { + CachableEntry filter_block; + + const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block); + if (!s.ok()) { + return; + } + + assert(filter_block.GetValue()); + + FilterBitsReader* const filter_bits_reader = + filter_block.GetValue()->filter_bits_reader(); + + if (!filter_bits_reader) { return; } @@ -195,39 +237,55 @@ void FullFilterBlockReader::MayMatch(MultiGetRange* range) { // &may_match[0] doesn't work for autovector (compiler error). 
So // declare both keys and may_match as arrays, which is also slightly less // expensive compared to autovector - Slice* keys[MultiGetContext::MAX_BATCH_SIZE]; - bool may_match[MultiGetContext::MAX_BATCH_SIZE] = {false}; + std::array keys; + std::array may_match = {{true}}; + autovector prefixes; int num_keys = 0; - for (auto iter = range->begin(); iter != range->end(); ++iter) { - keys[num_keys++] = &iter->ukey; + MultiGetRange filter_range(*range, range->begin(), range->end()); + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { + if (!prefix_extractor) { + keys[num_keys++] = &iter->ukey; + } else if (prefix_extractor->InDomain(iter->ukey)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &prefixes.back(); + } else { + filter_range.SkipKey(iter); + } } - filter_bits_reader_->MayMatch(num_keys, &keys[0], &may_match[0]); + + filter_bits_reader->MayMatch(num_keys, &keys[0], &may_match[0]); int i = 0; - for (auto iter = range->begin(); iter != range->end(); ++iter) { + for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { if (!may_match[i]) { + // Update original MultiGet range to skip this key. The filter_range + // was temporarily used just to skip keys not in prefix_extractor domain range->SkipKey(iter); + PERF_COUNTER_ADD(bloom_sst_miss_count, 1); + } else { + // PERF_COUNTER_ADD(bloom_sst_hit_count, 1); + PerfContext* perf_ctx = get_perf_context(); + perf_ctx->bloom_sst_hit_count++; } ++i; } } size_t FullFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = block_contents_.usable_size(); + size_t usage = ApproximateFilterBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); - usage += malloc_usable_size(filter_bits_reader_.get()); + usage += malloc_usable_size(const_cast(this)); #else usage += sizeof(*this); - usage += sizeof(*filter_bits_reader_.get()); #endif // ROCKSDB_MALLOC_USABLE_SIZE return usage; } -bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, - const Slice& user_key, const SliceTransform* prefix_extractor, - const Comparator* comparator, const Slice* const const_ikey_ptr, - bool* filter_checked, bool need_upper_bound_check) { +bool FullFilterBlockReader::RangeMayExist( + const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, BlockCacheLookupContext* lookup_context) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { *filter_checked = false; return true; @@ -240,22 +298,23 @@ bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, } else { *filter_checked = true; return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, - const_ikey_ptr); + const_ikey_ptr, /* get_context */ nullptr, + lookup_context); } } bool FullFilterBlockReader::IsFilterCompatible( const Slice* iterate_upper_bound, const Slice& prefix, - const Comparator* comparator) { + const Comparator* comparator) const { // Try to reuse the bloom filter in the SST table if prefix_extractor in // mutable_cf_options has changed. If range [user_key, upper_bound) all // share the same prefix then we may still be able to use the bloom filter. 
- if (iterate_upper_bound != nullptr && prefix_extractor_) { - if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + const SliceTransform* const prefix_extractor = table_prefix_extractor(); + if (iterate_upper_bound != nullptr && prefix_extractor) { + if (!prefix_extractor->InDomain(*iterate_upper_bound)) { return false; } - Slice upper_bound_xform = - prefix_extractor_->Transform(*iterate_upper_bound); + Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix if (!comparator->Equal(prefix, upper_bound_xform)) { // second check if user_key's prefix is the immediate predecessor of diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h new file mode 100644 index 00000000000..04f1ec22849 --- /dev/null +++ b/table/block_based/full_filter_block.h @@ -0,0 +1,139 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "util/hash.h" + +namespace rocksdb { + +class FilterPolicy; +class FilterBitsBuilder; +class FilterBitsReader; + +// A FullFilterBlockBuilder is used to construct a full filter for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// The format of full filter block is: +// +----------------------------------------------------------------+ +// | full filter for all keys in sst file | +// +----------------------------------------------------------------+ +// The full filter can be very large. At the end of it, we put +// num_probes: how many hash functions are used in bloom filter +// +class FullFilterBlockBuilder : public FilterBlockBuilder { + public: + explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, + bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder); + // No copying allowed + FullFilterBlockBuilder(const FullFilterBlockBuilder&) = delete; + void operator=(const FullFilterBlockBuilder&) = delete; + + // bits_builder is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockBuilder() {} + + virtual bool IsBlockBased() override { return false; } + virtual void StartBlock(uint64_t /*block_offset*/) override {} + virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& tmp, Status* status) override; + using FilterBlockBuilder::Finish; + + protected: + virtual void AddKey(const Slice& key); + std::unique_ptr filter_bits_builder_; + virtual void Reset(); + void AddPrefix(const Slice& key); + const SliceTransform* prefix_extractor() { return prefix_extractor_; } + + private: + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. 
+ const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + bool last_whole_key_recorded_; + std::string last_whole_key_str_; + bool last_prefix_recorded_; + std::string last_prefix_str_; + + uint32_t num_added_; + std::unique_ptr filter_data_; + +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader + : public FilterBlockReaderCommon { + public: + FullFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + void KeysMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + + void PrefixesMayMatch(MultiGetRange* range, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + BlockCacheLookupContext* lookup_context) override; + size_t ApproximateMemoryUsage() const override; + bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check, + BlockCacheLookupContext* lookup_context) override; + + private: + bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context) const; + void MayMatch(MultiGetRange* range, bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context) const; + bool IsFilterCompatible(const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) const; + + private: + bool full_length_enabled_; + size_t prefix_extractor_full_length_; +}; + +} // namespace rocksdb diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc new file mode 100644 index 00000000000..bde19121c95 --- /dev/null +++ b/table/block_based/full_filter_block_test.cc @@ -0,0 +1,333 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include + +#include "table/block_based/full_filter_block.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/block_based/filter_policy_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/string_util.h" + +namespace rocksdb { + +class TestFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit TestFilterBitsBuilder() {} + + // Add Key to filter + void AddKey(const Slice& key) override { + hash_entries_.push_back(Hash(key.data(), key.size(), 1)); + } + + // Generate the filter using the keys that are added + Slice Finish(std::unique_ptr* buf) override { + uint32_t len = static_cast(hash_entries_.size()) * 4; + char* data = new char[len]; + for (size_t i = 0; i < hash_entries_.size(); i++) { + EncodeFixed32(data + i * 4, hash_entries_[i]); + } + const char* const_data = data; + buf->reset(const_data); + return Slice(data, len); + } + + private: + std::vector hash_entries_; +}; + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class TestFilterBitsReader : public FilterBitsReader { + public: + explicit TestFilterBitsReader(const Slice& contents) + : data_(contents.data()), len_(static_cast(contents.size())) {} + + // Silence compiler warning about overloaded virtual + using FilterBitsReader::MayMatch; + bool MayMatch(const Slice& entry) override { + uint32_t h = Hash(entry.data(), entry.size(), 1); + for (size_t i = 0; i + 4 <= len_; i += 4) { + if (h == DecodeFixed32(data_ + i)) { + return true; + } + } + return false; + } + + private: + const char* data_; + uint32_t len_; +}; + + +class TestHashFilter : public FilterPolicy { + public: + const char* Name() const override { return "TestHashFilter"; } + + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } + + FilterBitsBuilder* GetFilterBitsBuilder() const override { + return new TestFilterBitsBuilder(); + } + + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { + return new TestFilterBitsReader(contents); + } +}; + +class PluginFullFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + PluginFullFilterBlockTest() + : mock::MockBlockBasedTableTester(new TestHashFilter) {} +}; + +TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice slice = builder.Finish(); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +class FullFilterBlockTest : public mock::MockBlockBasedTableTester, + public testing::Test { + public: + FullFilterBlockTest() + : mock::MockBlockBasedTableTester(NewBloomFilterPolicy(10, false)) {} +}; + +TEST_F(FullFilterBlockTest, EmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + Slice slice = builder.Finish(); + ASSERT_EQ("", EscapeString(slice)); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { + std::unique_ptr b_; + std::set uniq_; + + public: + explicit CountUniqueFilterBitsBuilderWrapper(FilterBitsBuilder* b) : b_(b) {} + + ~CountUniqueFilterBitsBuilderWrapper() override {} + + void AddKey(const Slice& key) override { + b_->AddKey(key); + uniq_.insert(key.ToString()); + } + + Slice Finish(std::unique_ptr* buf) override { + Slice rv = b_->Finish(buf); + uniq_.clear(); + return rv; + } + + int CalculateNumEntry(const uint32_t bytes) override { + return b_->CalculateNumEntry(bytes); + } + + size_t CountUnique() { return 
uniq_.size(); } +}; + +TEST_F(FullFilterBlockTest, DuplicateEntries) { + { // empty prefixes + std::unique_ptr prefix_extractor( + NewFixedPrefixTransform(0)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + ASSERT_EQ(0, bits_builder->CountUnique()); + // adds key and empty prefix; both abstractions count them + builder.Add("key1"); + ASSERT_EQ(2, builder.NumAdded()); + ASSERT_EQ(2, bits_builder->CountUnique()); + // Add different key (unique) and also empty prefix (not unique). + // From here in this test, it's immaterial whether the block builder + // can count unique keys. + builder.Add("key2"); + ASSERT_EQ(3, bits_builder->CountUnique()); + // Empty key -> nothing unique + builder.Add(""); + ASSERT_EQ(3, bits_builder->CountUnique()); + } + + // mix of empty and non-empty + std::unique_ptr prefix_extractor( + NewFixedPrefixTransform(7)); + auto bits_builder = new CountUniqueFilterBitsBuilderWrapper(GetBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add(""); // test with empty key too + builder.Add("prefix1key1"); + builder.Add("prefix1key1"); + builder.Add("prefix1key2"); + builder.Add("prefix1key3"); + builder.Add("prefix2key4"); + // 1 empty, 2 non-empty prefixes, and 4 non-empty keys + ASSERT_EQ(1 + 2 + 4, bits_builder->CountUnique()); +} + +TEST_F(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); + Slice slice = builder.Finish(); + + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(slice)), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + + FullFilterBlockReader reader(table_.get(), std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", /*prefix_extractor=*/nullptr, + /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "missing", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch( + "other", /*prefix_extractor=*/nullptr, /*block_offset=*/kNotValid, + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, 
/*get_context=*/nullptr, + /*lookup_context=*/nullptr)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/table/index_builder.cc b/table/block_based/index_builder.cc similarity index 93% rename from table/index_builder.cc rename to table/block_based/index_builder.cc index 63cb80598fe..f3a4b10e01e 100644 --- a/table/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -7,17 +7,18 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/index_builder.h" +#include "table/block_based/index_builder.h" + #include -#include +#include #include #include #include "rocksdb/comparator.h" #include "rocksdb/flush_block_policy.h" +#include "table/block_based/partitioned_filter_block.h" #include "table/format.h" -#include "table/partitioned_filter_block.h" // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace rocksdb { @@ -35,7 +36,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, - table_opt.index_shortening); + table_opt.index_shortening, /* include_first_key */ false); } break; case BlockBasedTableOptions::kHashSearch: { result = new HashIndexBuilder( @@ -47,6 +48,12 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = PartitionedIndexBuilder::CreateIndexBuilder( comparator, use_value_delta_encoding, table_opt); } break; + case BlockBasedTableOptions::kBinarySearchWithFirstKey: { + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding, + table_opt.index_shortening, /* include_first_key */ true); + } break; default: { assert(!"Do not recognize the index type "); } break; @@ -93,7 +100,7 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, - table_opt_.index_shortening); + table_opt_.index_shortening, /* include_first_key */ false); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, // Note: this is sub-optimal since sub_index_builder_ could later reset diff --git a/table/index_builder.h b/table/block_based/index_builder.h similarity index 90% rename from table/index_builder.h rename to table/block_based/index_builder.h index 2f349fc5471..47348b31f78 100644 --- a/table/index_builder.h +++ b/table/block_based/index_builder.h @@ -10,15 +10,15 @@ #pragma once #include -#include +#include #include #include #include #include "rocksdb/comparator.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_builder.h" #include "table/format.h" namespace rocksdb { @@ -58,6 +58,7 @@ class IndexBuilder { // To allow further optimization, we provide `last_key_in_current_block` and // `first_key_in_next_block`, based on which the specific implementation can // determine the best index key to be used for the index block. + // Called before the OnKeyAdded() call for first_key_in_next_block. 
// @last_key_in_current_block: this parameter maybe overridden with the value // "substitute key". // @first_key_in_next_block: it will be nullptr if the entry being added is @@ -123,7 +124,8 @@ class ShortenedIndexBuilder : public IndexBuilder { const InternalKeyComparator* comparator, const int index_block_restart_interval, const uint32_t format_version, const bool use_value_delta_encoding, - BlockBasedTableOptions::IndexShorteningMode shortening_mode) + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + bool include_first_key) : IndexBuilder(comparator), index_block_builder_(index_block_restart_interval, true /*use_delta_encoding*/, @@ -131,11 +133,19 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_without_seq_(index_block_restart_interval, true /*use_delta_encoding*/, use_value_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), + include_first_key_(include_first_key), shortening_mode_(shortening_mode) { // Making the default true will disable the feature for old versions seperator_is_key_plus_seq_ = (format_version <= 2); } + virtual void OnKeyAdded(const Slice& key) override { + if (include_first_key_ && current_block_first_internal_key_.empty()) { + current_block_first_internal_key_.assign(key.data(), key.size()); + } + } + virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { @@ -159,20 +169,27 @@ class ShortenedIndexBuilder : public IndexBuilder { } auto sep = Slice(*last_key_in_current_block); - std::string handle_encoding; - block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64(&handle_delta_encoding, - block_handle.size() - last_encoded_handle_.size()); - assert(handle_delta_encoding.size() != 0); + assert(!include_first_key_ || !current_block_first_internal_key_.empty()); + IndexValue entry(block_handle, current_block_first_internal_key_); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); + if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) { + entry.EncodeTo(&delta_encoded_entry, include_first_key_, + &last_encoded_handle_); + } else { + // If it's the first block, or delta encoding is disabled, + // BlockBuilder::Add() below won't use delta-encoded slice. 
+ } last_encoded_handle_ = block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_block_builder_.Add(sep, handle_encoding, - &handle_delta_encoding_slice); + const Slice delta_encoded_entry_slice(delta_encoded_entry); + index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); if (!seperator_is_key_plus_seq_) { - index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, - &handle_delta_encoding_slice); + index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, + &delta_encoded_entry_slice); } + + current_block_first_internal_key_.clear(); } using IndexBuilder::Finish; @@ -200,9 +217,12 @@ class ShortenedIndexBuilder : public IndexBuilder { private: BlockBuilder index_block_builder_; BlockBuilder index_block_builder_without_seq_; + const bool use_value_delta_encoding_; bool seperator_is_key_plus_seq_; + const bool include_first_key_; BlockBasedTableOptions::IndexShorteningMode shortening_mode_; - BlockHandle last_encoded_handle_; + BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle(); + std::string current_block_first_internal_key_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -243,7 +263,7 @@ class HashIndexBuilder : public IndexBuilder { : IndexBuilder(comparator), primary_index_builder_(comparator, index_block_restart_interval, format_version, use_value_delta_encoding, - shortening_mode), + shortening_mode, /* include_first_key */ false), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, diff --git a/table/block_based/mock_block_based_table.h b/table/block_based/mock_block_based_table.h new file mode 100644 index 00000000000..52891b1bdad --- /dev/null +++ b/table/block_based/mock_block_based_table.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
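The kBinarySearchWithFirstKey index type wired up in IndexBuilder::CreateIndexBuilder above (which makes ShortenedIndexBuilder store each data block's first internal key in the index entry) is selected through BlockBasedTableOptions. A minimal, hypothetical configuration sketch follows; the database path is made up and error handling is elided.

#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

int main() {
  rocksdb::BlockBasedTableOptions table_options;
  // Opt into the index format that also records each block's first key.
  table_options.index_type =
      rocksdb::BlockBasedTableOptions::kBinarySearchWithFirstKey;

  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/first_key_index_demo", &db);
  if (s.ok()) {
    delete db;
  }
  return s.ok() ? 0 : 1;
}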
+#pragma once + +#include "rocksdb/filter_policy.h" +#include "table/block_based/block_based_filter_block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/filter_policy_internal.h" + +namespace rocksdb { +namespace mock { + +class MockBlockBasedTable : public BlockBasedTable { + public: + explicit MockBlockBasedTable(Rep* rep) + : BlockBasedTable(rep, nullptr /* block_cache_tracer */) {} +}; + +class MockBlockBasedTableTester { + static constexpr int kMockLevel = 0; + + public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; + BlockBasedTableOptions table_options_; + InternalKeyComparator icomp_; + std::unique_ptr table_; + + MockBlockBasedTableTester(const FilterPolicy *filter_policy) + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator) { + table_options_.filter_policy.reset(filter_policy); + + constexpr bool skip_filters = false; + constexpr bool immortal_table = false; + table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep( + ioptions_, env_options_, table_options_, icomp_, skip_filters, + kMockLevel, immortal_table))); + } + + FilterBitsBuilder* GetBuilder() const { + FilterBuildingContext context(table_options_); + context.column_family_name = "mock_cf"; + context.compaction_style = ioptions_.compaction_style; + context.level_at_creation = kMockLevel; + return BloomFilterPolicy::GetBuilderFromContext(context); + } +}; + +} // namespace mock +} // namespace rocksdb diff --git a/table/block_based/parsed_full_filter_block.cc b/table/block_based/parsed_full_filter_block.cc new file mode 100644 index 00000000000..5cc259d1906 --- /dev/null +++ b/table/block_based/parsed_full_filter_block.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/parsed_full_filter_block.h" +#include "rocksdb/filter_policy.h" + +namespace rocksdb { + +ParsedFullFilterBlock::ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents) + : block_contents_(std::move(contents)), + filter_bits_reader_( + !block_contents_.data.empty() + ? filter_policy->GetFilterBitsReader(block_contents_.data) + : nullptr) {} + +ParsedFullFilterBlock::~ParsedFullFilterBlock() = default; + +} // namespace rocksdb diff --git a/table/block_based/parsed_full_filter_block.h b/table/block_based/parsed_full_filter_block.h new file mode 100644 index 00000000000..8f15f935058 --- /dev/null +++ b/table/block_based/parsed_full_filter_block.h @@ -0,0 +1,40 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "table/format.h" + +namespace rocksdb { + +class FilterBitsReader; +class FilterPolicy; + +// The sharable/cachable part of the full filter. 
+class ParsedFullFilterBlock { + public: + ParsedFullFilterBlock(const FilterPolicy* filter_policy, + BlockContents&& contents); + ~ParsedFullFilterBlock(); + + FilterBitsReader* filter_bits_reader() const { + return filter_bits_reader_.get(); + } + + // TODO: consider memory usage of the FilterBitsReader + size_t ApproximateMemoryUsage() const { + return block_contents_.ApproximateMemoryUsage(); + } + + bool own_bytes() const { return block_contents_.own_bytes(); } + + private: + BlockContents block_contents_; + std::unique_ptr filter_bits_reader_; +}; + +} // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc new file mode 100644 index 00000000000..b9b96989fde --- /dev/null +++ b/table/block_based/partitioned_filter_block.cc @@ -0,0 +1,390 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/partitioned_filter_block.h" + +#include + +#include "monitoring/perf_context_imp.h" +#include "port/malloc.h" +#include "port/port.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/coding.h" + +namespace rocksdb { + +PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( + const SliceTransform* _prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size) + : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, + filter_bits_builder), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + p_index_builder_(p_index_builder), + filters_in_partition_(0), + num_added_(0) { + filters_per_partition_ = + filter_bits_builder_->CalculateNumEntry(partition_size); +} + +PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} + +void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( + const Slice* next_key) { + // Use == to send the request only once + if (filters_in_partition_ == filters_per_partition_) { + // Currently only index builder is in charge of cutting a partition. We keep + // requesting until it is granted. + p_index_builder_->RequestPartitionCut(); + } + if (!p_index_builder_->ShouldCutFilterBlock()) { + return; + } + filter_gc.push_back(std::unique_ptr(nullptr)); + + // Add the prefix of the next key before finishing the partition. This hack, + // fixes a bug with format_verison=3 where seeking for the prefix would lead + // us to the previous partition. 
+ const bool add_prefix = + next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); + if (add_prefix) { + FullFilterBlockBuilder::AddPrefix(*next_key); + } + + Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); + std::string& index_key = p_index_builder_->GetPartitionKey(); + filters.push_back({index_key, filter}); + filters_in_partition_ = 0; + Reset(); +} + +void PartitionedFilterBlockBuilder::Add(const Slice& key) { + MaybeCutAFilterBlock(&key); + FullFilterBlockBuilder::Add(key); +} + +void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + filters_in_partition_++; + num_added_++; +} + +Slice PartitionedFilterBlockBuilder::Finish( + const BlockHandle& last_partition_block_handle, Status* status) { + if (finishing_filters == true) { + // Record the handle of the last written filter block in the index + FilterEntry& last_entry = filters.front(); + std::string handle_encoding; + last_partition_block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); + } + filters.pop_front(); + } else { + MaybeCutAFilterBlock(nullptr); + } + // If there is no filter partition left, then return the index on filter + // partitions + if (UNLIKELY(filters.empty())) { + *status = Status::OK(); + if (finishing_filters) { + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } + } else { + // This is the rare case where no key was added to the filter + return Slice(); + } + } else { + // Return the next filter partition in line and set Incomplete() status to + // indicate we expect more calls to Finish + *status = Status::Incomplete(); + finishing_filters = true; + return filters.front().filter; + } +} + +PartitionedFilterBlockReader::PartitionedFilterBlockReader( + const BlockBasedTable* t, CachableEntry&& filter_block) + : FilterBlockReaderCommon(t, std::move(filter_block)) {} + +std::unique_ptr PartitionedFilterBlockReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + + CachableEntry filter_block; + if (prefetch || !use_cache) { + const Status s = ReadFilterBlock(table, prefetch_buffer, ReadOptions(), + use_cache, nullptr /* get_context */, + lookup_context, &filter_block); + if (!s.ok()) { + return std::unique_ptr(); + } + + if (use_cache && !pin) { + filter_block.Reset(); + } + } + + return std::unique_ptr( + new PartitionedFilterBlockReader(table, std::move(filter_block))); +} + +bool PartitionedFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context) { + assert(const_ikey_ptr != nullptr); + 
assert(block_offset == kNotValid); + if (!whole_key_filtering()) { + return true; + } + + return MayMatch(key, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::KeyMayMatch); +} + +bool PartitionedFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context) { +#ifdef NDEBUG + (void)block_offset; +#endif + assert(const_ikey_ptr != nullptr); + assert(block_offset == kNotValid); + if (!table_prefix_extractor() && !prefix_extractor) { + return true; + } + + return MayMatch(prefix, prefix_extractor, block_offset, no_io, const_ikey_ptr, + get_context, lookup_context, + &FullFilterBlockReader::PrefixMayMatch); +} + +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( + const CachableEntry& filter_block, const Slice& entry) const { + IndexBlockIter iter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &iter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + iter.Seek(entry); + if (UNLIKELY(!iter.Valid())) { + // entry is larger than all the keys. However its prefix might still be + // present in the last partition. If this is called by PrefixMayMatch this + // is necessary for correct behavior. Otherwise it is unnecessary but safe. + // Assuming this is an unlikely case for full key search, the performance + // overhead should be negligible. + iter.SeekToLast(); + } + assert(iter.Valid()); + BlockHandle fltr_blk_handle = iter.value().handle; + return fltr_blk_handle; +} + +Status PartitionedFilterBlockReader::GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const { + assert(table()); + assert(filter_block); + assert(filter_block->IsEmpty()); + + if (!filter_map_.empty()) { + auto iter = filter_map_.find(fltr_blk_handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (iter != filter_map_.end()) { + filter_block->SetUnownedValue(iter->second.GetValue()); + return Status::OK(); + } + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + const Status s = + table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, + BlockType::kFilter, get_context, lookup_context, + /* for_compaction */ false, /* use_cache */ true); + + return s; +} + +bool PartitionedFilterBlockReader::MayMatch( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const { + CachableEntry filter_block; + Status s = + GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + if (UNLIKELY(filter_block.GetValue()->size() == 0)) { + return true; + } + + auto filter_handle = GetFilterPartitionHandle(filter_block, *const_ikey_ptr); + if (UNLIKELY(filter_handle.size() == 0)) { // key is out of 
range + return false; + } + + CachableEntry filter_partition_block; + s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, + no_io, get_context, lookup_context, + &filter_partition_block); + if (UNLIKELY(!s.ok())) { + return true; + } + + FullFilterBlockReader filter_partition(table(), + std::move(filter_partition_block)); + return (filter_partition.*filter_function)( + slice, prefix_extractor, block_offset, no_io, const_ikey_ptr, get_context, + lookup_context); +} + +size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { + size_t usage = ApproximateFilterBlockMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// TODO(myabandeh): merge this with the same function in IndexReader +void PartitionedFilterBlockReader::CacheDependencies(bool pin) { + assert(table()); + + const BlockBasedTable::Rep* const rep = table()->get_rep(); + assert(rep); + + BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; + + CachableEntry filter_block; + + Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &filter_block); + if (!s.ok()) { + ROCKS_LOG_WARN(rep->ioptions.info_log, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return; + } + + // Before read partitions, prefetch them to avoid lots of IOs + assert(filter_block.GetValue()); + + IndexBlockIter biter; + const InternalKeyComparator* const comparator = internal_comparator(); + Statistics* kNullStats = nullptr; + filter_block.GetValue()->NewIndexIterator( + comparator, comparator->user_comparator(), &biter, kNullStats, + true /* total_order_seek */, false /* have_first_key */, + index_key_includes_seq(), index_value_is_full()); + // Index partitions are assumed to be consecuitive. Prefetch them all. 
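A hedged, standalone sketch of the prefetch-range computation performed next: because the filter partitions are written consecutively, the first and last partition handles bound one contiguous byte range, so a single prefetch of that range warms every partition. Handle, PartitionPrefetchRange and the trailer value are illustrative names only, not the RocksDB types.

#include <cstdint>
#include <iostream>
#include <vector>

struct Handle {
  uint64_t offset;
  uint64_t size;
};

// Compute the single contiguous range covering all partitions, assuming they
// were written back to back in file order.
bool PartitionPrefetchRange(const std::vector<Handle>& partitions,
                            uint64_t block_trailer_size, uint64_t* off,
                            uint64_t* len) {
  if (partitions.empty()) {
    return false;
  }
  const Handle& first = partitions.front();
  const Handle& last = partitions.back();
  *off = first.offset;
  *len = last.offset + last.size + block_trailer_size - first.offset;
  return true;
}

int main() {
  std::vector<Handle> parts = {{1000, 200}, {1205, 180}, {1390, 150}};
  uint64_t off = 0, len = 0;
  if (PartitionPrefetchRange(parts, /*block_trailer_size=*/5, &off, &len)) {
    std::cout << "prefetch offset=" << off << " len=" << len << "\n";
  }
}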
+ // Read the first block offset + biter.SeekToFirst(); + BlockHandle handle = biter.value().handle; + uint64_t prefetch_off = handle.offset(); + + // Read the last block's offset + biter.SeekToLast(); + handle = biter.value().handle; + uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; + uint64_t prefetch_len = last_off - prefetch_off; + std::unique_ptr prefetch_buffer; + + prefetch_buffer.reset(new FilePrefetchBuffer()); + s = prefetch_buffer->Prefetch(rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + + // After prefetch, read the partitions one by one + ReadOptions read_options; + for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { + handle = biter.value().handle; + + CachableEntry block; + // TODO: Support counter batch update for partitioned index and + // filter blocks + s = table()->MaybeReadBlockAndLoadToCache( + prefetch_buffer.get(), read_options, handle, + UncompressionDict::GetEmptyDict(), &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); + + assert(s.ok() || block.GetValue() == nullptr); + if (s.ok() && block.GetValue() != nullptr) { + if (block.IsCached()) { + if (pin) { + filter_map_[handle.offset()] = std::move(block); + } + } + } + } +} + +const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() + const { + assert(table()); + assert(table()->get_rep()); + + return &table()->get_rep()->internal_comparator; +} + +bool PartitionedFilterBlockReader::index_key_includes_seq() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_key_includes_seq; +} + +bool PartitionedFilterBlockReader::index_value_is_full() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->index_value_is_full; +} + +} // namespace rocksdb diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h new file mode 100644 index 00000000000..089773d4751 --- /dev/null +++ b/table/block_based/partitioned_filter_block.h @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include +#include +#include +#include "db/dbformat.h" +#include "index_builder.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block_reader_common.h" +#include "table/block_based/full_filter_block.h" +#include "util/autovector.h" + +namespace rocksdb { + +class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { + public: + explicit PartitionedFilterBlockBuilder( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, + PartitionedIndexBuilder* const p_index_builder, + const uint32_t partition_size); + + virtual ~PartitionedFilterBlockBuilder(); + + void AddKey(const Slice& key) override; + void Add(const Slice& key) override; + + size_t NumAdded() const override { return num_added_; } + + virtual Slice Finish(const BlockHandle& last_partition_block_handle, + Status* status) override; + + private: + // Filter data + BlockBuilder index_on_filter_block_builder_; // top-level index builder + BlockBuilder + index_on_filter_block_builder_without_seq_; // same for user keys + struct FilterEntry { + std::string key; + Slice filter; + }; + std::list filters; // list of partitioned indexes and their keys + std::unique_ptr value; + std::vector> filter_gc; + bool finishing_filters = + false; // true if Finish is called once but not complete yet. + // The policy of when cut a filter block and Finish it + void MaybeCutAFilterBlock(const Slice* next_key); + // Currently we keep the same number of partitions for filters and indexes. + // This would allow for some potentioal optimizations in future. 
If such + // optimizations did not realize we can use different number of partitions and + // eliminate p_index_builder_ + PartitionedIndexBuilder* const p_index_builder_; + // The desired number of filters per partition + uint32_t filters_per_partition_; + // The current number of filters in the last partition + uint32_t filters_in_partition_; + // Number of keys added + size_t num_added_; + BlockHandle last_encoded_handle_; +}; + +class PartitionedFilterBlockReader : public FilterBlockReaderCommon { + public: + PartitionedFilterBlockReader(const BlockBasedTable* t, + CachableEntry&& filter_block); + + static std::unique_ptr Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context); + + bool IsBlockBased() override { return false; } + bool KeyMayMatch(const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context) override; + + size_t ApproximateMemoryUsage() const override; + + private: + BlockHandle GetFilterPartitionHandle(const CachableEntry& filter_block, + const Slice& entry) const; + Status GetFilterPartitionBlock( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* filter_block) const; + + using FilterFunction = bool (FullFilterBlockReader::*)( + const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, + const Slice* const const_ikey_ptr, GetContext* get_context, + BlockCacheLookupContext* lookup_context); + bool MayMatch(const Slice& slice, const SliceTransform* prefix_extractor, + uint64_t block_offset, bool no_io, const Slice* const_ikey_ptr, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilterFunction filter_function) const; + void CacheDependencies(bool pin) override; + + const InternalKeyComparator* internal_comparator() const; + bool index_key_includes_seq() const; + bool index_value_is_full() const; + + protected: + std::unordered_map> + filter_map_; +}; + +} // namespace rocksdb diff --git a/table/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc similarity index 60% rename from table/partitioned_filter_block_test.cc rename to table/block_based/partitioned_filter_block_test.cc index 8068f14d815..315789f55b7 100644 --- a/table/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -7,46 +7,50 @@ #include "rocksdb/filter_policy.h" -#include "table/full_filter_bits_builder.h" -#include "table/index_builder.h" -#include "table/partitioned_filter_block.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/partitioned_filter_block.h" +#include "table/block_based/filter_policy_internal.h" + +#include "index_builder.h" +#include "logging/logging.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/hash.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { -std::map slices; +std::map blooms; class 
MockedBlockBasedTable : public BlockBasedTable { public: - explicit MockedBlockBasedTable(Rep* rep) : BlockBasedTable(rep) { + MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib) + : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) { // Initialize what Open normally does as much as necessary for the test - rep->cache_key_prefix_size = 10; - } - - CachableEntry GetFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, bool /* unused */, GetContext* /* unused */, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return {obj, nullptr}; + rep->index_key_includes_seq = pib->seperator_is_key_plus_seq(); + rep->index_value_is_full = !pib->get_use_value_delta_encoding(); } +}; - FilterBlockReader* ReadFilter( - FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, - const SliceTransform* prefix_extractor) const override { - Slice slice = slices[filter_blk_handle.offset()]; - auto obj = new FullFilterBlockReader( - prefix_extractor, true, BlockContents(slice), - rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); - return obj; +class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { + public: + MyPartitionedFilterBlockReader(BlockBasedTable* t, + CachableEntry&& filter_block) + : PartitionedFilterBlockReader(t, std::move(filter_block)) { + for (const auto& pair : blooms) { + const uint64_t offset = pair.first; + const std::string& bloom = pair.second; + + assert(t); + assert(t->get_rep()); + CachableEntry block( + new ParsedFullFilterBlock( + t->get_rep()->table_options.filter_policy.get(), + BlockContents(Slice(bloom))), + nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + filter_map_[offset] = std::move(block); + } } }; @@ -54,19 +58,26 @@ class PartitionedFilterBlockTest : public testing::Test, virtual public ::testing::WithParamInterface { public: + Options options_; + ImmutableCFOptions ioptions_; + EnvOptions env_options_; BlockBasedTableOptions table_options_; - InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); - - PartitionedFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close - // will access variable that are not - // initialized in our mocked version + InternalKeyComparator icomp_; + std::unique_ptr table_; + std::shared_ptr cache_; + int bits_per_key_; + + PartitionedFilterBlockTest() + : ioptions_(options_), + env_options_(options_), + icomp_(options_.comparator), + bits_per_key_(10) { + table_options_.filter_policy.reset( + NewBloomFilterPolicy(bits_per_key_, false)); table_options_.format_version = GetParam(); table_options_.index_block_restart_interval = 3; } - std::shared_ptr cache_; ~PartitionedFilterBlockTest() override {} const std::string keys[4] = {"afoo", "bar", "box", "hello"}; @@ -83,22 +94,15 @@ class PartitionedFilterBlockTest } uint64_t MaxFilterSize() { - uint32_t dont_care1, dont_care2; int num_keys = sizeof(keys) / sizeof(*keys); - auto filter_bits_reader = dynamic_cast( - table_options_.filter_policy->GetFilterBitsBuilder()); - assert(filter_bits_reader); - auto partition_size = - filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2); - 
delete filter_bits_reader; - return partition_size + - partition_size * table_options_.block_size_deviation / 100; + // General, rough over-approximation + return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5); } - int last_offset = 10; + uint64_t last_offset = 10; BlockHandle Write(const Slice& slice) { BlockHandle bh(last_offset + 1, slice.size()); - slices[bh.offset()] = slice; + blooms[bh.offset()] = slice.ToString(); last_offset += bh.size(); return bh; } @@ -106,7 +110,7 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* NewIndexBuilder() { const bool kValueDeltaEncoded = true; return PartitionedIndexBuilder::CreateIndexBuilder( - &icomp, !kValueDeltaEncoded, table_options_); + &icomp_, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -122,16 +126,14 @@ class PartitionedFilterBlockTest const bool kValueDeltaEncoded = true; return new PartitionedFilterBlockBuilder( prefix_extractor, table_options_.whole_key_filtering, - table_options_.filter_policy->GetFilterBitsBuilder(), + BloomFilterPolicy::GetBuilderFromContext( + FilterBuildingContext(table_options_)), table_options_.index_block_restart_interval, !kValueDeltaEncoded, p_index_builder, partition_size); } - std::unique_ptr table; - PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, - const SliceTransform* prefix_extractor) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib) { BlockHandle bh; Status status; Slice slice; @@ -139,19 +141,21 @@ class PartitionedFilterBlockTest slice = builder->Finish(bh, &status); bh = Write(slice); } while (status.IsIncomplete()); - const Options options; - const ImmutableCFOptions ioptions(options); - const MutableCFOptions moptions(options); - const EnvOptions env_options; - const bool kSkipFilters = true; - const bool kImmortal = true; - table.reset(new MockedBlockBasedTable( - new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, - !kSkipFilters, 0, !kImmortal))); - auto reader = new PartitionedFilterBlockReader( - prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, - table.get(), pib->seperator_is_key_plus_seq(), - !pib->get_use_value_delta_encoding()); + + constexpr bool skip_filters = false; + constexpr int level = 0; + constexpr bool immortal_table = false; + table_.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, + icomp_, skip_filters, level, immortal_table), + pib)); + BlockContents contents(slice); + CachableEntry block( + new Block(std::move(contents), kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, nullptr), + nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); + auto reader = + new MyPartitionedFilterBlockReader(table_.get(), std::move(block)); return reader; } @@ -159,33 +163,37 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* pib, bool empty = false, const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr reader( - NewReader(builder, pib, prefix_extractor)); + NewReader(builder, pib)); // Querying added keys const bool no_io = true; for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, - &ikey_slice)); + &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, 
ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch( + keys[0], prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } // querying missing keys for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } else { // assuming a good hash function - ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, - !no_io, &ikey_slice)); + ASSERT_FALSE(reader->KeyMayMatch( + key, prefix_extractor, kNotValid, !no_io, &ikey_slice, + /*get_context=*/nullptr, /*lookup_context=*/nullptr)); } } } @@ -321,7 +329,7 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder( NewBuilder(pib.get(), prefix_extractor.get())); - const std::string pkeys[3] = {"p-key1", "p-key2", "p-key3"}; + const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"}; builder->Add(pkeys[0]); CutABlock(pib.get(), pkeys[0], pkeys[1]); builder->Add(pkeys[1]); @@ -329,13 +337,62 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { builder->Add(pkeys[2]); CutABlock(pib.get(), pkeys[2]); std::unique_ptr reader( - NewReader(builder.get(), pib.get(), prefix_extractor.get())); + NewReader(builder.get(), pib.get())); for (auto key : pkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), - prefix_extractor.get(), kNotValid, - false /*no_io*/, &ikey_slice)); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } + // Non-existent keys but with the same prefix + const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"}; + for (auto key : pnonkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix_extractor->Transform(key), prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); + } +} + +// This reproduces the bug in format_version=3 that the seeking the prefix will +// lead us to the partition before the one that has filter for the prefix. +TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr prefix_extractor( + rocksdb::NewFixedPrefixTransform(2)); + std::unique_ptr pib(NewIndexBuilder()); + std::unique_ptr builder( + NewBuilder(pib.get(), prefix_extractor.get())); + // In the bug, searching for prefix "p3" on an index with format version 3, + // will give the key "p3" and the partition of the keys that are <= p3, i.e., + // p2-keys, where the filter for prefix "p3" does not exist. 
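A toy, self-contained model of the bug this test reproduces and of the fix in MaybeCutAFilterBlock: seeking the bare prefix selects the partition whose separator is greater than or equal to the prefix, which can be the partition before the one that holds that prefix's keys; the fix is to also add the next key's prefix to the partition being cut. The map/set stand-ins below are illustrative only, not the real filter or index structures.

#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <string>

// Each partition is keyed in a top-level index by its separator (>= every key
// in it) and holds a set of keys/prefixes standing in for its Bloom filter.
using Partition = std::set<std::string>;

bool PrefixMayMatch(const std::map<std::string, Partition>& index,
                    const std::string& prefix) {
  // Seek: the first separator >= prefix selects the partition to consult.
  auto it = index.lower_bound(prefix);
  if (it == index.end()) {
    it = std::prev(index.end());  // past the last separator: check last one
  }
  return it->second.count(prefix) > 0;
}

int main() {
  std::map<std::string, Partition> index;
  // Partition ending at "p2-key2" gets separator "p3" (shortened toward the
  // next key "p3-key3"). Without the fix it only knows p1/p2 prefixes.
  index["p3"] = {"p1", "p2", "p1-key1", "p2-key2"};
  index["p5-key3"] = {"p3", "p4", "p5", "p3-key3", "p4-key3", "p5-key3"};

  std::cout << std::boolalpha
            << PrefixMayMatch(index, "p3") << "\n";  // false: wrong partition

  // The fix: before cutting a partition, also add the next key's prefix, so
  // the partition the seek lands on can answer for that prefix as well.
  index["p3"].insert("p3");
  std::cout << PrefixMayMatch(index, "p3") << "\n";  // true
}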
+ const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3", + "p5-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2], pkeys[3]); + builder->Add(pkeys[3]); + CutABlock(pib.get(), pkeys[3], pkeys[4]); + builder->Add(pkeys[4]); + CutABlock(pib.get(), pkeys[4]); + std::unique_ptr reader( + NewReader(builder.get(), pib.get())); + for (auto key : pkeys) { + auto prefix = prefix_extractor->Transform(key); + auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch( + prefix, prefix_extractor.get(), kNotValid, + /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, + /*lookup_context=*/nullptr)); } } diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc new file mode 100644 index 00000000000..10ceab16f22 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "table/block_based/uncompression_dict_reader.h" +#include "monitoring/perf_context_imp.h" +#include "table/block_based/block_based_table_reader.h" +#include "util/compression.h" + +namespace rocksdb { + +Status UncompressionDictReader::Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader) { + assert(table); + assert(table->get_rep()); + assert(!pin || prefetch); + assert(uncompression_dict_reader); + + CachableEntry uncompression_dict; + if (prefetch || !use_cache) { + const Status s = ReadUncompressionDictionary( + table, prefetch_buffer, ReadOptions(), use_cache, + nullptr /* get_context */, lookup_context, &uncompression_dict); + if (!s.ok()) { + return s; + } + + if (use_cache && !pin) { + uncompression_dict.Reset(); + } + } + + uncompression_dict_reader->reset( + new UncompressionDictReader(table, std::move(uncompression_dict))); + + return Status::OK(); +} + +Status UncompressionDictReader::ReadUncompressionDictionary( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict) { + // TODO: add perf counter for compression dictionary read time + + assert(table); + assert(uncompression_dict); + assert(uncompression_dict->IsEmpty()); + + const BlockBasedTable::Rep* const rep = table->get_rep(); + assert(rep); + assert(!rep->compression_dict_handle.IsNull()); + + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->compression_dict_handle, + UncompressionDict::GetEmptyDict(), uncompression_dict, + BlockType::kCompressionDictionary, get_context, lookup_context, + /* for_compaction */ false, use_cache); + + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from compression dictionary " + "block %s", + s.ToString().c_str()); + } + + return s; +} + +Status UncompressionDictReader::GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, 
bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict) const { + assert(uncompression_dict); + + if (!uncompression_dict_.IsEmpty()) { + uncompression_dict->SetUnownedValue(uncompression_dict_.GetValue()); + return Status::OK(); + } + + ReadOptions read_options; + if (no_io) { + read_options.read_tier = kBlockCacheTier; + } + + return ReadUncompressionDictionary(table_, prefetch_buffer, read_options, + cache_dictionary_blocks(), get_context, + lookup_context, uncompression_dict); +} + +size_t UncompressionDictReader::ApproximateMemoryUsage() const { + assert(!uncompression_dict_.GetOwnValue() || + uncompression_dict_.GetValue() != nullptr); + size_t usage = uncompression_dict_.GetOwnValue() + ? uncompression_dict_.GetValue()->ApproximateMemoryUsage() + : 0; + +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size(const_cast(this)); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + + return usage; +} + +bool UncompressionDictReader::cache_dictionary_blocks() const { + assert(table_); + assert(table_->get_rep()); + + return table_->get_rep()->table_options.cache_index_and_filter_blocks; +} + +} // namespace rocksdb diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h new file mode 100644 index 00000000000..bfaf0b4bc70 --- /dev/null +++ b/table/block_based/uncompression_dict_reader.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include "table/block_based/cachable_entry.h" +#include "table/format.h" + +namespace rocksdb { + +class BlockBasedTable; +struct BlockCacheLookupContext; +class FilePrefetchBuffer; +class GetContext; +struct ReadOptions; +struct UncompressionDict; + +// Provides access to the uncompression dictionary regardless of whether +// it is owned by the reader or stored in the cache, or whether it is pinned +// in the cache or not. 
+class UncompressionDictReader { + public: + static Status Create( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + bool use_cache, bool prefetch, bool pin, + BlockCacheLookupContext* lookup_context, + std::unique_ptr* uncompression_dict_reader); + + Status GetOrReadUncompressionDictionary( + FilePrefetchBuffer* prefetch_buffer, bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict) const; + + size_t ApproximateMemoryUsage() const; + + private: + UncompressionDictReader(const BlockBasedTable* t, + CachableEntry&& uncompression_dict) + : table_(t), uncompression_dict_(std::move(uncompression_dict)) { + assert(table_); + } + + bool cache_dictionary_blocks() const; + + static Status ReadUncompressionDictionary( + const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer, + const ReadOptions& read_options, bool use_cache, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + CachableEntry* uncompression_dict); + + const BlockBasedTable* table_; + CachableEntry uncompression_dict_; +}; + +} // namespace rocksdb diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc deleted file mode 100644 index 6b352b2f6b0..00000000000 --- a/table/block_based_filter_block_test.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "table/block_based_filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -// For testing: emit an array with one hash value per key -class TestHashFilter : public FilterPolicy { - public: - const char* Name() const override { return "TestHashFilter"; } - - void CreateFilter(const Slice* keys, int n, std::string* dst) const override { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - bool KeyMayMatch(const Slice& key, const Slice& filter) const override { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } -}; - -class FilterBlockTest : public testing::Test { - public: - TestHashFilter policy_; - BlockBasedTableOptions table_options_; - - FilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST_F(FilterBlockTest, EmptyBuilder) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - BlockContents block(builder.Finish()); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100000)); -} - -TEST_F(FilterBlockTest, SingleChunk) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - ASSERT_EQ(0, builder.NumAdded()); - builder.StartBlock(100); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.StartBlock(200); - builder.Add("box"); - builder.StartBlock(300); - builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - BlockContents block(builder.Finish()); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr, 100)); -} - -TEST_F(FilterBlockTest, MultiChunk) { - BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - - // First filter - builder.StartBlock(0); - builder.Add("foo"); - builder.StartBlock(2000); - builder.Add("bar"); - - // Second filter - builder.StartBlock(3100); - builder.Add("box"); - - // Third filter is empty - - // Last filter - builder.StartBlock(9000); - builder.Add("box"); - builder.Add("hello"); - - BlockContents block(builder.Finish()); - BlockBasedFilterBlockReader reader(nullptr, table_options_, true, - std::move(block), nullptr); - - // Check first filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, uint64_t{0})); - - // Check second filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 3100)); - 
ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 3100)); - - // Check third filter (empty) - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 4100)); - - // Check last filter - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 9000)); -} - -// Test for block based filter block -// use new interface in FilterPolicy to create filter builder/reader -class BlockBasedFilterBlockTest : public testing::Test { - public: - BlockBasedTableOptions table_options_; - - BlockBasedFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); - } - - ~BlockBasedFilterBlockTest() override {} -}; - -TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { - FilterBlockBuilder* builder = - new BlockBasedFilterBlockBuilder(nullptr, table_options_); - BlockContents block(builder->Finish()); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100000)); - - delete builder; - delete reader; -} - -TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { - FilterBlockBuilder* builder = - new BlockBasedFilterBlockBuilder(nullptr, table_options_); - builder->StartBlock(100); - builder->Add("foo"); - builder->Add("bar"); - builder->Add("box"); - builder->StartBlock(200); - builder->Add("box"); - builder->StartBlock(300); - builder->Add("hello"); - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 100)); - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("missing", nullptr, 100)); - ASSERT_TRUE(!reader->KeyMayMatch("other", nullptr, 100)); - - delete builder; - delete reader; -} - -TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { - FilterBlockBuilder* builder = - new BlockBasedFilterBlockBuilder(nullptr, table_options_); - - // First filter - builder->StartBlock(0); - builder->Add("foo"); - builder->StartBlock(2000); - builder->Add("bar"); - - // Second filter - builder->StartBlock(3100); - builder->Add("box"); - - // Third filter is empty - - // Last filter - builder->StartBlock(9000); - builder->Add("box"); - builder->Add("hello"); - - BlockContents block(builder->Finish()); - FilterBlockReader* reader = new BlockBasedFilterBlockReader( - nullptr, table_options_, true, std::move(block), nullptr); - - // Check first filter - ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); - ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 2000)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, uint64_t{0})); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, uint64_t{0})); - - // Check second filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 3100)); - 
ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 3100)); - - // Check third filter (empty) - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 4100)); - - // Check last filter - ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 9000)); - ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 9000)); - - delete builder; - delete reader; -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc deleted file mode 100644 index ad088337a19..00000000000 --- a/table/block_based_table_reader.cc +++ /dev/null @@ -1,3700 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "table/block_based_table_reader.h" - -#include -#include -#include -#include -#include -#include - -#include "db/dbformat.h" -#include "db/pinned_iterators_manager.h" - -#include "rocksdb/cache.h" -#include "rocksdb/comparator.h" -#include "rocksdb/env.h" -#include "rocksdb/filter_policy.h" -#include "rocksdb/iterator.h" -#include "rocksdb/options.h" -#include "rocksdb/statistics.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" - -#include "table/block.h" -#include "table/block_based_filter_block.h" -#include "table/block_based_table_factory.h" -#include "table/block_fetcher.h" -#include "table/block_prefix_index.h" -#include "table/filter_block.h" -#include "table/format.h" -#include "table/full_filter_block.h" -#include "table/get_context.h" -#include "table/internal_iterator.h" -#include "table/meta_blocks.h" -#include "table/multiget_context.h" -#include "table/partitioned_filter_block.h" -#include "table/persistent_cache_helper.h" -#include "table/sst_file_writer_collectors.h" -#include "table/two_level_iterator.h" - -#include "monitoring/perf_context_imp.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/stop_watch.h" -#include "util/string_util.h" -#include "util/sync_point.h" -#include "util/xxhash.h" - -namespace rocksdb { - -extern const uint64_t kBlockBasedTableMagicNumber; -extern const std::string kHashIndexPrefixesBlock; -extern const std::string kHashIndexPrefixesMetadataBlock; - -typedef BlockBasedTable::IndexReader IndexReader; - -BlockBasedTable::~BlockBasedTable() { - Close(); - delete rep_; -} - -std::atomic BlockBasedTable::next_cache_key_id_(0); - -namespace { -// Read the block identified by "handle" from "file". -// The only relevant option is options.verify_checksums for now. -// On failure return non-OK. -// On success fill *result and return OK - caller owns *result -// @param uncompression_dict Data for presetting the compression library's -// dictionary. 
-Status ReadBlockFromFile( - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, - const UncompressionDict& uncompression_dict, - const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) { - BlockContents contents; - BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle, - &contents, ioptions, do_uncompress, - maybe_compressed, uncompression_dict, - cache_options, memory_allocator); - Status s = block_fetcher.ReadBlockContents(); - if (s.ok()) { - result->reset(new Block(std::move(contents), global_seqno, - read_amp_bytes_per_bit, ioptions.statistics)); - } - - return s; -} - -inline MemoryAllocator* GetMemoryAllocator( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache.get() - ? table_options.block_cache->memory_allocator() - : nullptr; -} - -inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache_compressed.get() - ? table_options.block_cache_compressed->memory_allocator() - : nullptr; -} - -// Delete the resource that is held by the iterator. -template -void DeleteHeldResource(void* arg, void* /*ignored*/) { - delete reinterpret_cast(arg); -} - -// Delete the entry resided in the cache. -template -void DeleteCachedEntry(const Slice& /*key*/, void* value) { - auto entry = reinterpret_cast(value); - delete entry; -} - -void DeleteCachedFilterEntry(const Slice& key, void* value); -void DeleteCachedIndexEntry(const Slice& key, void* value); -void DeleteCachedUncompressionDictEntry(const Slice& key, void* value); - -// Release the cached entry and decrement its ref count. -void ReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - -// Release the cached entry and decrement its ref count. 
-void ForceReleaseCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle, true /* force_erase */); -} - -Slice GetCacheKeyFromOffset(const char* cache_key_prefix, - size_t cache_key_prefix_size, uint64_t offset, - char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= BlockBasedTable::kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset); - return Slice(cache_key, static_cast(end - cache_key)); -} - -Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - int level, Tickers block_cache_miss_ticker, - Tickers block_cache_hit_ticker, - uint64_t* block_cache_miss_stats, - uint64_t* block_cache_hit_stats, - Statistics* statistics, - GetContext* get_context) { - auto cache_handle = block_cache->Lookup(key, statistics); - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache hit - get_context->get_context_stats_.num_cache_hit++; - // total bytes read from cache - get_context->get_context_stats_.num_cache_bytes_read += - block_cache->GetUsage(cache_handle); - // block-type specific cache hit - (*block_cache_hit_stats)++; - } else { - // overall cache hit - RecordTick(statistics, BLOCK_CACHE_HIT); - // total bytes read from cache - RecordTick(statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(cache_handle)); - RecordTick(statistics, block_cache_hit_ticker); - } - } else { - PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, - static_cast(level)); - if (get_context != nullptr) { - // overall cache miss - get_context->get_context_stats_.num_cache_miss++; - // block-type specific cache miss - (*block_cache_miss_stats)++; - } else { - RecordTick(statistics, BLOCK_CACHE_MISS); - RecordTick(statistics, block_cache_miss_ticker); - } - } - - return cache_handle; -} - -// For hash based index, return true if prefix_extractor and -// prefix_extractor_block mismatch, false otherwise. This flag will be used -// as total_order_seek via NewIndexIterator -bool PrefixExtractorChanged(const TableProperties* table_properties, - const SliceTransform* prefix_extractor) { - // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set. - // Turn off hash index in prefix_extractor is not set; if prefix_extractor - // is set but prefix_extractor_block is not set, also disable hash index - if (prefix_extractor == nullptr || table_properties == nullptr || - table_properties->prefix_extractor_name.empty()) { - return true; - } - - // prefix_extractor and prefix_extractor_block are both non-empty - if (table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) != 0) { - return true; - } else { - return false; - } -} - -} // namespace - -// Index that allows binary search lookup in a two-level index structure. -class PartitionIndexReader : public IndexReader, public Cleanable { - public: - // Read the partition index from the file and create an instance for - // `PartitionIndexReader`. - // On success, index_reader will be populated; otherwise it will remain - // unmodified. 
- static Status Create(BlockBasedTable* table, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const int level, const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); - - if (s.ok()) { - *index_reader = new PartitionIndexReader( - table, icomparator, std::move(index_block), ioptions.statistics, - level, index_key_includes_seq, index_value_is_full); - } - - return s; - } - - // return a two-level iterator: first level is on the partition index - InternalIteratorBase* NewIterator( - IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true, - bool fill_cache = true) override { - Statistics* kNullStats = nullptr; - // Filters are already checked before seeking the index - if (!partition_map_.empty()) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return NewTwoLevelIterator( - new BlockBasedTable::PartitionedIndexIteratorState( - table_, &partition_map_, index_key_includes_seq_, - index_value_is_full_), - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_)); - } else { - auto ro = ReadOptions(); - ro.fill_cache = fill_cache; - bool kIsIndex = true; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return new BlockBasedTableIterator( - table_, ro, *icomparator_, - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), nullptr, - kNullStats, true, index_key_includes_seq_, index_value_is_full_), - false, true, /* prefix_extractor */ nullptr, kIsIndex, - index_key_includes_seq_, index_value_is_full_); - } - // TODO(myabandeh): Update TwoLevelIterator to be able to make use of - // on-stack BlockIter while the state is on heap. Currentlly it assumes - // the first level iter is always on heap and will attempt to delete it - // in its destructor. - } - - void CacheDependencies(bool pin) override { - // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - // Index partitions are assumed to be consecuitive. Prefetch them all. - // Read the first block offset - biter.SeekToFirst(); - if (!biter.Valid()) { - // Empty index. - return; - } - handle = biter.value(); - uint64_t prefetch_off = handle.offset(); - - // Read the last block's offset - biter.SeekToLast(); - if (!biter.Valid()) { - // Empty index. 
- return; - } - handle = biter.value(); - uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; - uint64_t prefetch_len = last_off - prefetch_off; - std::unique_ptr prefetch_buffer; - auto& file = table_->rep_->file; - prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); - - // After prefetch, read the partitions one by one - biter.SeekToFirst(); - auto ro = ReadOptions(); - Cache* block_cache = rep->table_options.block_cache.get(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value(); - BlockBasedTable::CachableEntry block; - const bool is_index = true; - // TODO: Support counter batch update for partitioned index and - // filter blocks - s = table_->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), rep, ro, handle, - UncompressionDict::GetEmptyDict(), &block, is_index, - nullptr /* get_context */); - - assert(s.ok() || block.value == nullptr); - if (s.ok() && block.value != nullptr) { - if (block.cache_handle != nullptr) { - if (pin) { - partition_map_[handle.offset()] = block; - RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { - block_cache->Release(block.cache_handle); - } - } else { - delete block.value; - } - } - } - } - - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - // TODO(myabandeh): more accurate estimate of partition_map_ mem usage - return usage; - } - - private: - PartitionIndexReader(BlockBasedTable* table, - const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats, - const int /*level*/, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - table_(table), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - BlockBasedTable* table_; - std::unique_ptr index_block_; - std::unordered_map> - partition_map_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; -}; - -// Index that allows binary search lookup for the first key of each block. -// This class can be viewed as a thin wrapper for `Block` class which already -// supports binary search. -class BinarySearchIndexReader : public IndexReader { - public: - // Read index from the file and create an intance for - // `BinarySearchIndexReader`. - // On success, index_reader will be populated; otherwise it will remain - // unmodified. 
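// Sketch: the prefetch-range arithmetic used by CacheDependencies() above.
// Index partitions are assumed to be laid out consecutively, so one read
// covering [first partition offset, end of last partition] warms them all.
// BlockHandleLite and kBlockTrailerBytes are illustrative stand-ins.
#include <cstdint>
#include <utility>

struct BlockHandleLite {
  uint64_t offset;
  uint64_t size;
};

// 1-byte compression type + 4-byte checksum, mirroring kBlockTrailerSize.
constexpr uint64_t kBlockTrailerBytes = 5;

// Returns {prefetch_offset, prefetch_length} covering every partition from
// `first` through `last`, assuming they are stored back to back in the file.
inline std::pair<uint64_t, uint64_t> PartitionPrefetchRange(
    const BlockHandleLite& first, const BlockHandleLite& last) {
  const uint64_t prefetch_off = first.offset;
  const uint64_t last_end = last.offset + last.size + kBlockTrailerBytes;
  return {prefetch_off, last_end - prefetch_off};
}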
- static Status Create(RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const BlockHandle& index_handle, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - IndexReader** index_reader, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, - const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); - - if (s.ok()) { - *index_reader = new BinarySearchIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); - } - - return s; - } - - InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, - bool /*dont_care*/ = true) override { - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - } - - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - BinarySearchIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, - Statistics* stats, const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - std::unique_ptr index_block_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; -}; - -// Index that leverages an internal hash table to quicken the lookup for a given -// key. -class HashIndexReader : public IndexReader { - public: - static Status Create( - const SliceTransform* hash_key_extractor, const Footer& footer, - RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, const BlockHandle& index_handle, - InternalIterator* meta_index_iter, IndexReader** index_reader, - bool /*hash_index_allow_collision*/, - const PersistentCacheOptions& cache_options, - const bool index_key_includes_seq, const bool index_value_is_full, - MemoryAllocator* memory_allocator) { - std::unique_ptr index_block; - auto s = ReadBlockFromFile( - file, prefetch_buffer, footer, ReadOptions(), index_handle, - &index_block, ioptions, true /* decompress */, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, memory_allocator); - - if (!s.ok()) { - return s; - } - - // Note, failure to create prefix hash index does not need to be a - // hard error. 
We can still fall back to the original binary search index. - // So, Create will succeed regardless, from this point on. - - auto new_index_reader = new HashIndexReader( - icomparator, std::move(index_block), ioptions.statistics, - index_key_includes_seq, index_value_is_full); - *index_reader = new_index_reader; - - // Get prefixes block - BlockHandle prefixes_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock, - &prefixes_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - // Get index metadata block - BlockHandle prefixes_meta_handle; - s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock, - &prefixes_meta_handle); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - // Read contents for the blocks - BlockContents prefixes_contents; - BlockFetcher prefixes_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); - s = prefixes_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - return s; - } - BlockContents prefixes_meta_contents; - BlockFetcher prefixes_meta_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, - &prefixes_meta_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); - s = prefixes_meta_block_fetcher.ReadBlockContents(); - if (!s.ok()) { - // TODO: log error - return Status::OK(); - } - - BlockPrefixIndex* prefix_index = nullptr; - s = BlockPrefixIndex::Create(hash_key_extractor, prefixes_contents.data, - prefixes_meta_contents.data, &prefix_index); - // TODO: log error - if (s.ok()) { - new_index_reader->prefix_index_.reset(prefix_index); - } - - return Status::OK(); - } - - InternalIteratorBase* NewIterator( - IndexBlockIter* iter = nullptr, bool total_order_seek = true, - bool /*dont_care*/ = true) override { - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. 
- return index_block_->NewIterator( - icomparator_, icomparator_->user_comparator(), iter, kNullStats, - total_order_seek, index_key_includes_seq_, index_value_is_full_, - false /* block_contents_pinned */, prefix_index_.get()); - } - - size_t size() const override { return index_block_->size(); } - size_t usable_size() const override { return index_block_->usable_size(); } - - size_t ApproximateMemoryUsage() const override { - assert(index_block_); - size_t usage = index_block_->ApproximateMemoryUsage(); - usage += prefixes_contents_.usable_size(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - if (prefix_index_) { - usage += prefix_index_->ApproximateMemoryUsage(); - } - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - } - - private: - HashIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats, - const bool index_key_includes_seq, - const bool index_value_is_full) - : IndexReader(icomparator, stats), - index_block_(std::move(index_block)), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - assert(index_block_ != nullptr); - } - - ~HashIndexReader() override {} - - std::unique_ptr index_block_; - std::unique_ptr prefix_index_; - BlockContents prefixes_contents_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; -}; - -// Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { - assert(kMaxCacheKeyPrefixSize >= 10); - rep->cache_key_prefix_size = 0; - rep->compressed_cache_key_prefix_size = 0; - if (rep->table_options.block_cache != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), - &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); - // Create dummy offset of index reader which is beyond the file size. - rep->dummy_index_reader_offset = - file_size + rep->table_options.block_cache->NewId(); - } - if (rep->table_options.persistent_cache != nullptr) { - GenerateCachePrefix(/*cache=*/nullptr, rep->file->file(), - &rep->persistent_cache_key_prefix[0], - &rep->persistent_cache_key_prefix_size); - } - if (rep->table_options.block_cache_compressed != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), - rep->file->file(), &rep->compressed_cache_key_prefix[0], - &rep->compressed_cache_key_prefix_size); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. - if (cc && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); - } -} - -void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file, - char* buffer, size_t* size) { - // generate an id from the file - *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); - - // If the prefix wasn't generated or was too long, - // create one from the cache. - if (*size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); - } -} - -namespace { -// Return True if table_properties has `user_prop_name` has a `true` value -// or it doesn't contain this property (for backward compatible). 
-bool IsFeatureSupported(const TableProperties& table_properties, - const std::string& user_prop_name, Logger* info_log) { - auto& props = table_properties.user_collected_properties; - auto pos = props.find(user_prop_name); - // Older version doesn't have this value set. Skip this check. - if (pos != props.end()) { - if (pos->second == kPropFalse) { - return false; - } else if (pos->second != kPropTrue) { - ROCKS_LOG_WARN(info_log, "Property %s has invalidate value %s", - user_prop_name.c_str(), pos->second.c_str()); - } - } - return true; -} - -// Caller has to ensure seqno is not nullptr. -Status GetGlobalSequenceNumber(const TableProperties& table_properties, - SequenceNumber largest_seqno, - SequenceNumber* seqno) { - const auto& props = table_properties.user_collected_properties; - const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); - const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); - - *seqno = kDisableGlobalSequenceNumber; - if (version_pos == props.end()) { - if (seqno_pos != props.end()) { - std::array msg_buf; - // This is not an external sst file, global_seqno is not supported. - snprintf( - msg_buf.data(), msg_buf.max_size(), - "A non-external sst file have global seqno property with value %s", - seqno_pos->second.c_str()); - return Status::Corruption(msg_buf.data()); - } - return Status::OK(); - } - - uint32_t version = DecodeFixed32(version_pos->second.c_str()); - if (version < 2) { - if (seqno_pos != props.end() || version != 1) { - std::array msg_buf; - // This is a v1 external sst file, global_seqno is not supported. - snprintf(msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno " - "property with value %s", - version, seqno_pos->second.c_str()); - return Status::Corruption(msg_buf.data()); - } - return Status::OK(); - } - - // Since we have a plan to deprecate global_seqno, we do not return failure - // if seqno_pos == props.end(). We rely on version_pos to detect whether the - // SST is external. - SequenceNumber global_seqno(0); - if (seqno_pos != props.end()) { - global_seqno = DecodeFixed64(seqno_pos->second.c_str()); - } - // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno - // to denote it is unknown. 
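// Sketch: a condensed version of the resolution rules in
// GetGlobalSequenceNumber() (above and continued below). Returns false for
// the corruption cases; kUnknownSeqnoSketch stands in for
// kDisableGlobalSequenceNumber. Purely illustrative, not RocksDB API.
#include <cstdint>

constexpr uint64_t kUnknownSeqnoSketch = UINT64_MAX;

inline bool ResolveGlobalSeqnoSketch(bool has_version_prop, uint32_t version,
                                     bool has_seqno_prop, uint64_t stored_seqno,
                                     uint64_t largest_seqno, uint64_t* out) {
  *out = kUnknownSeqnoSketch;          // default: no global seqno in effect
  if (!has_version_prop) {             // not an external SST file
    return !has_seqno_prop;            // a stray seqno property is corruption
  }
  if (version < 2) {                   // v1 external files: unsupported
    return version == 1 && !has_seqno_prop;
  }
  uint64_t seqno = has_seqno_prop ? stored_seqno : 0;
  if (largest_seqno < kUnknownSeqnoSketch) {  // caller knows the largest seqno
    if (seqno == 0) {
      seqno = largest_seqno;           // 0 means "take the file's largest"
    }
    if (seqno != largest_seqno) {
      return false;                    // stored value contradicts the file
    }
  }
  *out = seqno;
  return true;
}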
- if (largest_seqno < kMaxSequenceNumber) { - if (global_seqno == 0) { - global_seqno = largest_seqno; - } - if (global_seqno != largest_seqno) { - std::array msg_buf; - snprintf( - msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno property " - "with value %s, while largest seqno in the file is %llu", - version, seqno_pos->second.c_str(), - static_cast(largest_seqno)); - return Status::Corruption(msg_buf.data()); - } - } - *seqno = global_seqno; - - if (global_seqno > kMaxSequenceNumber) { - std::array msg_buf; - snprintf(msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno property " - "with value %llu, which is greater than kMaxSequenceNumber", - version, static_cast(global_seqno)); - return Status::Corruption(msg_buf.data()); - } - - return Status::OK(); -} -} // namespace - -Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, - size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key) { - assert(cache_key != nullptr); - assert(cache_key_prefix_size != 0); - assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); - memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = - EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); - return Slice(cache_key, static_cast(end - cache_key)); -} - -Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - std::unique_ptr&& file, - uint64_t file_size, - std::unique_ptr* table_reader, - const SliceTransform* prefix_extractor, - const bool prefetch_index_and_filter_in_cache, - const bool skip_filters, const int level, - const bool immortal_table, - const SequenceNumber largest_seqno, - TailPrefetchStats* tail_prefetch_stats) { - table_reader->reset(); - - Status s; - Footer footer; - std::unique_ptr prefetch_buffer; - - // prefetch both index and filters, down to all partitions - const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; - const bool preload_all = !table_options.cache_index_and_filter_blocks; - - s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, - preload_all, &prefetch_buffer); - - // Read in the following order: - // 1. Footer - // 2. [metaindex block] - // 3. [meta block: properties] - // 4. [meta block: range deletion tombstone] - // 5. [meta block: compression dictionary] - // 6. [meta block: index] - // 7. [meta block: filter] - s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, - kBlockBasedTableMagicNumber); - if (!s.ok()) { - return s; - } - if (!BlockBasedTableSupportedVersion(footer.version())) { - return Status::Corruption( - "Unknown Footer version. Maybe this file was created with newer " - "version of RocksDB?"); - } - - // We've successfully read the footer. We are ready to serve requests. - // Better not mutate rep_ after the creation. eg. internal_prefix_transform - // raw pointer will be used to create HashIndexReader, whose reset may - // access a dangling pointer. 
- Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, - internal_comparator, skip_filters, level, - immortal_table); - rep->file = std::move(file); - rep->footer = footer; - rep->index_type = table_options.index_type; - rep->hash_index_allow_collision = table_options.hash_index_allow_collision; - // We need to wrap data with internal_prefix_transform to make sure it can - // handle prefix correctly. - rep->internal_prefix_transform.reset( - new InternalKeySliceTransform(prefix_extractor)); - SetupCacheKeyPrefix(rep, file_size); - std::unique_ptr new_table(new BlockBasedTable(rep)); - - // page cache options - rep->persistent_cache_options = - PersistentCacheOptions(rep->table_options.persistent_cache, - std::string(rep->persistent_cache_key_prefix, - rep->persistent_cache_key_prefix_size), - rep->ioptions.statistics); - - // Read metaindex - std::unique_ptr meta; - std::unique_ptr meta_iter; - s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter); - if (!s.ok()) { - return s; - } - - s = ReadPropertiesBlock(rep, prefetch_buffer.get(), meta_iter.get(), - largest_seqno); - if (!s.ok()) { - return s; - } - s = ReadRangeDelBlock(rep, prefetch_buffer.get(), meta_iter.get(), - internal_comparator); - if (!s.ok()) { - return s; - } - s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), - new_table.get(), prefix_extractor, - prefetch_all, table_options, level, - prefetch_index_and_filter_in_cache); - - if (s.ok()) { - // Update tail prefetch stats - assert(prefetch_buffer.get() != nullptr); - if (tail_prefetch_stats != nullptr) { - assert(prefetch_buffer->min_offset_read() < file_size); - tail_prefetch_stats->RecordEffectiveSize( - static_cast(file_size) - prefetch_buffer->min_offset_read()); - } - - *table_reader = std::move(new_table); - } - - return s; -} - -Status BlockBasedTable::PrefetchTail( - RandomAccessFileReader* file, uint64_t file_size, - TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, - const bool preload_all, - std::unique_ptr* prefetch_buffer) { - size_t tail_prefetch_size = 0; - if (tail_prefetch_stats != nullptr) { - // Multiple threads may get a 0 (no history) when running in parallel, - // but it will get cleared after the first of them finishes. - tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); - } - if (tail_prefetch_size == 0) { - // Before read footer, readahead backwards to prefetch data. Do more - // readahead if we're going to read index/filter. - // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. - // That's because we need to issue readahead before we read the properties, - // at which point we don't yet know the index type. - tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; - } - size_t prefetch_off; - size_t prefetch_len; - if (file_size < tail_prefetch_size) { - prefetch_off = 0; - prefetch_len = static_cast(file_size); - } else { - prefetch_off = static_cast(file_size - tail_prefetch_size); - prefetch_len = tail_prefetch_size; - } - TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", - &tail_prefetch_size); - Status s; - // TODO should not have this special logic in the future. 
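// Sketch: the tail-read sizing just computed above. Use the historical
// suggestion when one exists, otherwise 512 KB if index/filter will be read
// up front and 4 KB if not, then clamp so the read never starts before 0.
// Illustrative helper, not RocksDB API.
#include <algorithm>
#include <cstdint>
#include <utility>

// Returns {offset, length} of the tail read. `suggested` is 0 when there is
// no prefetch history yet.
inline std::pair<uint64_t, uint64_t> TailPrefetchRangeSketch(
    uint64_t file_size, uint64_t suggested, bool read_meta_up_front) {
  uint64_t len = suggested != 0
                     ? suggested
                     : (read_meta_up_front ? 512 * 1024 : 4 * 1024);
  len = std::min(len, file_size);
  return {file_size - len, len};
}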
- if (!file->use_direct_io()) { - prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); - s = file->Prefetch(prefetch_off, prefetch_len); - } else { - prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); - s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); - } - return s; -} - -Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, - uint32_t expected) { - Status s; - uint32_t actual = 0; - switch (type) { - case kNoChecksum: - break; - case kCRC32c: - expected = crc32c::Unmask(expected); - actual = crc32c::Value(buf, len); - break; - case kxxHash: - actual = XXH32(buf, static_cast(len), 0); - break; - case kxxHash64: - actual = static_cast(XXH64(buf, static_cast(len), 0) & - uint64_t{0xffffffff}); - break; - default: - s = Status::Corruption("unknown checksum type"); - } - if (s.ok() && actual != expected) { - s = Status::Corruption("properties block checksum mismatched"); - } - return s; -} - -Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, - TableProperties** table_properties) { - assert(table_properties != nullptr); - // If this is an external SST file ingested with write_global_seqno set to - // true, then we expect the checksum mismatch because checksum was written - // by SstFileWriter, but its global seqno in the properties block may have - // been changed during ingestion. In this case, we read the properties - // block, copy it to a memory buffer, change the global seqno to its - // original value, i.e. 0, and verify the checksum again. - BlockHandle props_block_handle; - CacheAllocationPtr tmp_buf; - Status s = ReadProperties(handle_value, rep->file.get(), prefetch_buffer, - rep->footer, rep->ioptions, table_properties, - false /* verify_checksum */, &props_block_handle, - &tmp_buf, false /* compression_type_missing */, - nullptr /* memory_allocator */); - if (s.ok() && tmp_buf) { - const auto seqno_pos_iter = - (*table_properties) - ->properties_offsets.find( - ExternalSstFilePropertyNames::kGlobalSeqno); - size_t block_size = static_cast(props_block_handle.size()); - if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { - uint64_t global_seqno_offset = seqno_pos_iter->second; - EncodeFixed64( - tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); - } - uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); - s = rocksdb::VerifyChecksum(rep->footer.checksum(), tmp_buf.get(), - block_size + 1, value); - } - return s; -} - -Status BlockBasedTable::ReadPropertiesBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const SequenceNumber largest_seqno) { - bool found_properties_block = true; - Status s; - s = SeekToPropertiesBlock(meta_iter, &found_properties_block); - - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error when seeking to properties block from file: %s", - s.ToString().c_str()); - } else if (found_properties_block) { - s = meta_iter->status(); - TableProperties* table_properties = nullptr; - if (s.ok()) { - s = ReadProperties( - meta_iter->value(), rep->file.get(), prefetch_buffer, rep->footer, - rep->ioptions, &table_properties, true /* verify_checksum */, - nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, - false /* compression_type_missing */, nullptr /* memory_allocator */); - } - - if (s.IsCorruption()) { - s = TryReadPropertiesWithGlobalSeqno( - rep, prefetch_buffer, meta_iter->value(), 
&table_properties); - } - std::unique_ptr props_guard; - if (table_properties != nullptr) { - props_guard.reset(table_properties); - } - - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Encountered error while reading data from properties " - "block %s", - s.ToString().c_str()); - } else { - assert(table_properties != nullptr); - rep->table_properties.reset(props_guard.release()); - rep->blocks_maybe_compressed = rep->table_properties->compression_name != - CompressionTypeToString(kNoCompression); - rep->blocks_definitely_zstd_compressed = - (rep->table_properties->compression_name == - CompressionTypeToString(kZSTD) || - rep->table_properties->compression_name == - CompressionTypeToString(kZSTDNotFinalCompression)); - } - } else { - ROCKS_LOG_ERROR(rep->ioptions.info_log, - "Cannot find Properties block from file."); - } -#ifndef ROCKSDB_LITE - if (rep->table_properties) { - ParseSliceTransform(rep->table_properties->prefix_extractor_name, - &(rep->table_prefix_extractor)); - } -#endif // ROCKSDB_LITE - - // Read the table properties, if provided. - if (rep->table_properties) { - rep->whole_key_filtering &= - IsFeatureSupported(*(rep->table_properties), - BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep->ioptions.info_log); - rep->prefix_filtering &= IsFeatureSupported( - *(rep->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); - - s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, - &(rep->global_seqno)); - if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); - } - } - return s; -} - -Status BlockBasedTable::ReadRangeDelBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - const InternalKeyComparator& internal_comparator) { - Status s; - bool found_range_del_block; - BlockHandle range_del_handle; - s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); - if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Error when seeking to range delete tombstones block from file: %s", - s.ToString().c_str()); - } else if (found_range_del_block && !range_del_handle.IsNull()) { - ReadOptions read_options; - std::unique_ptr iter(NewDataBlockIterator( - rep, read_options, range_del_handle, nullptr /* input_iter */, - false /* is_index */, true /* key_includes_seq */, - true /* index_key_is_full */, nullptr /* get_context */, Status(), - prefetch_buffer)); - assert(iter != nullptr); - s = iter->status(); - if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Encountered error while reading data from range del block %s", - s.ToString().c_str()); - } else { - rep->fragmented_range_dels = - std::make_shared(std::move(iter), - internal_comparator); - } - } - return s; -} - -Status BlockBasedTable::ReadCompressionDictBlock( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* compression_dict_block) { - assert(compression_dict_block != nullptr); - Status s; - if (!rep->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_cont{new BlockContents()}; - PersistentCacheOptions cache_options; - ReadOptions read_options; - read_options.verify_checksums = true; - BlockFetcher compression_block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, read_options, - rep->compression_dict_handle, compression_dict_cont.get(), - rep->ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options); - s = compression_block_fetcher.ReadBlockContents(); - - 
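// Sketch: block reads such as the one above are verified with the
// VerifyChecksum() helper defined earlier. For kCRC32c the stored value is
// "masked" (the LevelDB/RocksDB convention for CRCs stored alongside data
// that may itself contain CRCs), so verification first unmasks it. The CRC
// computation itself is assumed to come from a crc32c library.
#include <cstdint>

constexpr uint32_t kCrcMaskDeltaSketch = 0xa282ead8u;

// Mask: rotate right by 15 bits, then add a constant. Unmask inverts it.
inline uint32_t MaskCrcSketch(uint32_t crc) {
  return ((crc >> 15) | (crc << 17)) + kCrcMaskDeltaSketch;
}

inline uint32_t UnmaskCrcSketch(uint32_t masked) {
  const uint32_t rot = masked - kCrcMaskDeltaSketch;
  return (rot >> 17) | (rot << 15);
}

// `stored_masked` comes from the block trailer; `computed` is a fresh CRC32C
// over the block payload plus its compression-type byte.
inline bool CrcMatchesSketch(uint32_t stored_masked, uint32_t computed) {
  return UnmaskCrcSketch(stored_masked) == computed;
}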
if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Encountered error while reading data from compression dictionary " - "block %s", - s.ToString().c_str()); - } else { - *compression_dict_block = std::move(compression_dict_cont); - } - } - return s; -} - -Status BlockBasedTable::PrefetchIndexAndFilterBlocks( - Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, - BlockBasedTable* new_table, const SliceTransform* prefix_extractor, - bool prefetch_all, const BlockBasedTableOptions& table_options, - const int level, const bool prefetch_index_and_filter_in_cache) { - Status s; - - // Find filter handle and filter type - if (rep->filter_policy) { - for (auto filter_type : - {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, - Rep::FilterType::kBlockFilter}) { - std::string prefix; - switch (filter_type) { - case Rep::FilterType::kFullFilter: - prefix = kFullFilterBlockPrefix; - break; - case Rep::FilterType::kPartitionedFilter: - prefix = kPartitionedFilterBlockPrefix; - break; - case Rep::FilterType::kBlockFilter: - prefix = kFilterBlockPrefix; - break; - default: - assert(0); - } - std::string filter_block_key = prefix; - filter_block_key.append(rep->filter_policy->Name()); - if (FindMetaBlock(meta_iter, filter_block_key, &rep->filter_handle) - .ok()) { - rep->filter_type = filter_type; - break; - } - } - } - - { - // Find compression dictionary handle - bool found_compression_dict; - s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict, - &rep->compression_dict_handle); - } - - bool need_upper_bound_check = - PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor); - - BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType(); - // prefetch the first level of index - const bool prefetch_index = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); - // prefetch the first level of filter - const bool prefetch_filter = - prefetch_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); - // Partition fitlers cannot be enabled without partition indexes - assert(!prefetch_filter || prefetch_index); - // pin both index and filters, down to all partitions - const bool pin_all = - rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0; - // pin the first level of index - const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); - // pin the first level of filter - const bool pin_filter = - pin_all || (table_options.pin_top_level_index_and_filter && - rep->filter_type == Rep::FilterType::kPartitionedFilter); - // pre-fetching of blocks is turned on - // Will use block cache for meta-blocks access - // Always prefetch index and filter for level 0 - // TODO(ajkr): also prefetch compression dictionary block - if (table_options.cache_index_and_filter_blocks) { - assert(table_options.block_cache != nullptr); - if (prefetch_index) { - // Hack: Call NewIndexIterator() to implicitly add index to the - // block_cache - CachableEntry index_entry; - // check prefix_extractor match only if hash based index is used - bool disable_prefix_seek = - rep->index_type == BlockBasedTableOptions::kHashSearch && - need_upper_bound_check; - if (s.ok()) { - std::unique_ptr> iter( - new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek, - nullptr, &index_entry)); - s = 
iter->status(); - } - if (s.ok()) { - // This is the first call to NewIndexIterator() since we're in Open(). - // On success it should give us ownership of the `CachableEntry` by - // populating `index_entry`. - assert(index_entry.value != nullptr); - if (prefetch_all) { - index_entry.value->CacheDependencies(pin_all); - } - if (pin_index) { - rep->index_entry = std::move(index_entry); - } else { - index_entry.Release(table_options.block_cache.get()); - } - } - } - if (s.ok() && prefetch_filter) { - // Hack: Call GetFilter() to implicitly add filter to the block_cache - auto filter_entry = - new_table->GetFilter(rep->table_prefix_extractor.get()); - if (filter_entry.value != nullptr && prefetch_all) { - filter_entry.value->CacheDependencies( - pin_all, rep->table_prefix_extractor.get()); - } - // if pin_filter is true then save it in rep_->filter_entry; it will be - // released in the destructor only, hence it will be pinned in the - // cache while this reader is alive - if (pin_filter) { - rep->filter_entry = filter_entry; - } else { - filter_entry.Release(table_options.block_cache.get()); - } - } - } else { - // If we don't use block cache for meta-block access, we'll pre-load these - // blocks, which will kept in member variables in Rep and with a same life- - // time as this table object. - IndexReader* index_reader = nullptr; - if (s.ok()) { - s = new_table->CreateIndexReader(prefetch_buffer, &index_reader, - meta_iter, level); - } - std::unique_ptr compression_dict_block; - if (s.ok()) { - rep->index_reader.reset(index_reader); - // The partitions of partitioned index are always stored in cache. They - // are hence follow the configuration for pin and prefetch regardless of - // the value of cache_index_and_filter_blocks - if (prefetch_index_and_filter_in_cache || level == 0) { - rep->index_reader->CacheDependencies(pin_all); - } - - // Set filter block - if (rep->filter_policy) { - const bool is_a_filter_partition = true; - auto filter = new_table->ReadFilter(prefetch_buffer, rep->filter_handle, - !is_a_filter_partition, - rep->table_prefix_extractor.get()); - rep->filter.reset(filter); - // Refer to the comment above about paritioned indexes always being - // cached - if (filter && (prefetch_index_and_filter_in_cache || level == 0)) { - filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get()); - } - } - s = ReadCompressionDictBlock(rep, prefetch_buffer, - &compression_dict_block); - } else { - delete index_reader; - } - if (s.ok() && !rep->compression_dict_handle.IsNull()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - rep->uncompression_dict.reset(new UncompressionDict( - compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, rep->ioptions.statistics)); - } - } - return s; -} - -void BlockBasedTable::SetupForCompaction() { - switch (rep_->ioptions.access_hint_on_compaction_start) { - case Options::NONE: - break; - case Options::NORMAL: - rep_->file->file()->Hint(RandomAccessFile::NORMAL); - break; - case Options::SEQUENTIAL: - rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL); - break; - case Options::WILLNEED: - rep_->file->file()->Hint(RandomAccessFile::WILLNEED); - break; - default: - assert(false); - } -} - -std::shared_ptr BlockBasedTable::GetTableProperties() - const { - return rep_->table_properties; -} - -size_t BlockBasedTable::ApproximateMemoryUsage() const { - size_t usage = 0; - if (rep_->filter) { - usage += 
rep_->filter->ApproximateMemoryUsage(); - } - if (rep_->index_reader) { - usage += rep_->index_reader->ApproximateMemoryUsage(); - } - if (rep_->uncompression_dict) { - usage += rep_->uncompression_dict->ApproximateMemoryUsage(); - } - return usage; -} - -// Load the meta-block from the file. On success, return the loaded meta block -// and its iterator. -Status BlockBasedTable::ReadMetaBlock(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - std::unique_ptr* meta_block, - std::unique_ptr* iter) { - // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates - // it is an empty block. - std::unique_ptr meta; - Status s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - rep->footer.metaindex_handle(), &meta, rep->ioptions, - true /* decompress */, true /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep->persistent_cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, - GetMemoryAllocator(rep->table_options)); - - if (!s.ok()) { - ROCKS_LOG_ERROR(rep->ioptions.info_log, - "Encountered error while reading data from properties" - " block %s", - s.ToString().c_str()); - return s; - } - - *meta_block = std::move(meta); - // meta block uses bytewise comparator. - iter->reset(meta_block->get()->NewIterator( - BytewiseComparator(), BytewiseComparator())); - return Status::OK(); -} - -Status BlockBasedTable::GetDataBlockFromCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, Rep* rep, - const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, - const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, - bool is_index, GetContext* get_context) { - Status s; - BlockContents* compressed_block = nullptr; - Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = rep->ioptions.statistics; - - // Lookup uncompressed cache first - if (block_cache != nullptr) { - block->cache_handle = GetEntryFromCache( - block_cache, block_cache_key, rep->level, - is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, - is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, - get_context - ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss - : &get_context->get_context_stats_.num_cache_data_miss) - : nullptr, - get_context - ? (is_index ? &get_context->get_context_stats_.num_cache_index_hit - : &get_context->get_context_stats_.num_cache_data_hit) - : nullptr, - statistics, get_context); - if (block->cache_handle != nullptr) { - block->value = - reinterpret_cast(block_cache->Value(block->cache_handle)); - return s; - } - } - - // If not found, search from the compressed block cache. 
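// Sketch: the two-tier lookup performed by GetDataBlockFromCache(). The
// uncompressed cache is consulted first; on a miss the compressed cache is
// tried, and a compressed hit is decompressed and promoted into the
// uncompressed cache. std::unordered_map stands in for both caches and a
// single key is used for brevity (the real code derives a separate key per
// cache); all names are illustrative.
#include <functional>
#include <string>
#include <unordered_map>

using BlockMapSketch = std::unordered_map<std::string, std::string>;

// Returns true and fills *block on any hit; a full miss means the caller
// must read the block from the file.
inline bool TwoTierLookupSketch(
    BlockMapSketch& uncompressed_cache, const BlockMapSketch& compressed_cache,
    const std::string& key,
    const std::function<std::string(const std::string&)>& decompress,
    std::string* block) {
  auto it = uncompressed_cache.find(key);
  if (it != uncompressed_cache.end()) {
    *block = it->second;                    // fast path: already decoded
    return true;
  }
  auto cit = compressed_cache.find(key);
  if (cit == compressed_cache.end()) {
    return false;                           // full miss
  }
  *block = decompress(cit->second);         // decode once
  uncompressed_cache.emplace(key, *block);  // promote for future readers
  return true;
}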
- assert(block->cache_handle == nullptr && block->value == nullptr); - - if (block_cache_compressed == nullptr) { - return s; - } - - assert(!compressed_block_cache_key.empty()); - block_cache_compressed_handle = - block_cache_compressed->Lookup(compressed_block_cache_key); - // if we found in the compressed cache, then uncompress and insert into - // uncompressed cache - if (block_cache_compressed_handle == nullptr) { - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); - return s; - } - - // found compressed block - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); - compressed_block = reinterpret_cast( - block_cache_compressed->Value(block_cache_compressed_handle)); - CompressionType compression_type = compressed_block->get_compression_type(); - assert(compression_type != kNoCompression); - - // Retrieve the uncompressed contents into a new buffer - BlockContents contents; - UncompressionContext context(compression_type); - UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressBlockContents(info, compressed_block->data.data(), - compressed_block->data.size(), &contents, - rep->table_options.format_version, rep->ioptions, - GetMemoryAllocator(rep->table_options)); - - // Insert uncompressed block into block cache - if (s.ok()) { - block->value = - new Block(std::move(contents), rep->get_global_seqno(is_index), - read_amp_bytes_per_bit, - statistics); // uncompressed block - if (block_cache != nullptr && block->value->own_bytes() && - read_options.fill_cache) { - size_t charge = block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, block->value, charge, - &DeleteCachedEntry, - &(block->cache_handle)); -#ifndef NDEBUG - block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG - if (s.ok()) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } - } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete block->value; - block->value = nullptr; - } - } - } - - // Release hold on compressed cache entry - block_cache_compressed->Release(block_cache_compressed_handle); - return s; -} - -Status BlockBasedTable::PutDataBlockToCache( - const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, - CachableEntry* cached_block, BlockContents* raw_block_contents, - CompressionType raw_block_comp_type, uint32_t format_version, - const UncompressionDict& uncompression_dict, SequenceNumber seq_no, - size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, - bool is_index, Cache::Priority priority, GetContext* get_context) { - 
assert(raw_block_comp_type == kNoCompression || - block_cache_compressed != nullptr); - - Status s; - // Retrieve the uncompressed contents into a new buffer - BlockContents uncompressed_block_contents; - Statistics* statistics = ioptions.statistics; - if (raw_block_comp_type != kNoCompression) { - UncompressionContext context(raw_block_comp_type); - UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); - s = UncompressBlockContents(info, raw_block_contents->data.data(), - raw_block_contents->data.size(), - &uncompressed_block_contents, format_version, - ioptions, memory_allocator); - } - if (!s.ok()) { - return s; - } - - if (raw_block_comp_type != kNoCompression) { - cached_block->value = new Block(std::move(uncompressed_block_contents), - seq_no, read_amp_bytes_per_bit, - statistics); // uncompressed block - } else { - cached_block->value = - new Block(std::move(*raw_block_contents), seq_no, - read_amp_bytes_per_bit, ioptions.statistics); - } - - // Insert compressed block into compressed block cache. - // Release the hold on the compressed cache entry immediately. - if (block_cache_compressed != nullptr && - raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && - raw_block_contents->own_bytes()) { -#ifndef NDEBUG - assert(raw_block_contents->is_raw_block); -#endif // NDEBUG - - // We cannot directly put raw_block_contents because this could point to - // an object in the stack. - BlockContents* block_cont_for_comp_cache = - new BlockContents(std::move(*raw_block_contents)); - s = block_cache_compressed->Insert( - compressed_block_cache_key, block_cont_for_comp_cache, - block_cont_for_comp_cache->ApproximateMemoryUsage(), - &DeleteCachedEntry); - if (s.ok()) { - // Avoid the following code to delete this cached block. 
- RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); - } else { - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); - delete block_cont_for_comp_cache; - } - } - - // insert into uncompressed block cache - if (block_cache != nullptr && cached_block->value->own_bytes()) { - size_t charge = cached_block->value->ApproximateMemoryUsage(); - s = block_cache->Insert(block_cache_key, cached_block->value, charge, - &DeleteCachedEntry, - &(cached_block->cache_handle), priority); -#ifndef NDEBUG - block_cache->TEST_mark_as_data_block(block_cache_key, charge); -#endif // NDEBUG - if (s.ok()) { - assert(cached_block->cache_handle != nullptr); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - if (is_index) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_index_add++; - get_context->get_context_stats_.num_cache_index_bytes_insert += - charge; - } else { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } - } else { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_data_add++; - get_context->get_context_stats_.num_cache_data_bytes_insert += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); - } - } - assert(reinterpret_cast(block_cache->Value( - cached_block->cache_handle)) == cached_block->value); - } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete cached_block->value; - cached_block->value = nullptr; - } - } - - return s; -} - -FilterBlockReader* BlockBasedTable::ReadFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition, - const SliceTransform* prefix_extractor) const { - auto& rep = rep_; - // TODO: We might want to unify with ReadBlockFromFile() if we start - // requiring checksum verification in Table::Open. - if (rep->filter_type == Rep::FilterType::kNoFilter) { - return nullptr; - } - BlockContents block; - - BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), - filter_handle, &block, rep->ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - rep->persistent_cache_options, GetMemoryAllocator(rep->table_options)); - Status s = block_fetcher.ReadBlockContents(); - - if (!s.ok()) { - // Error reading the block - return nullptr; - } - - assert(rep->filter_policy); - - auto filter_type = rep->filter_type; - if (rep->filter_type == Rep::FilterType::kPartitionedFilter && - is_a_filter_partition) { - filter_type = Rep::FilterType::kFullFilter; - } - - switch (filter_type) { - case Rep::FilterType::kPartitionedFilter: { - return new PartitionedFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), nullptr, - rep->ioptions.statistics, rep->internal_comparator, this, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0); - } - - case Rep::FilterType::kBlockFilter: - return new BlockBasedFilterBlockReader( - rep->prefix_filtering ? 
prefix_extractor : nullptr, - rep->table_options, rep->whole_key_filtering, std::move(block), - rep->ioptions.statistics); - - case Rep::FilterType::kFullFilter: { - auto filter_bits_reader = - rep->filter_policy->GetFilterBitsReader(block.data); - assert(filter_bits_reader != nullptr); - return new FullFilterBlockReader( - rep->prefix_filtering ? prefix_extractor : nullptr, - rep->whole_key_filtering, std::move(block), filter_bits_reader, - rep->ioptions.statistics); - } - - default: - // filter_type is either kNoFilter (exited the function at the first if), - // or it must be covered in this switch block - assert(false); - return nullptr; - } -} - -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) const { - const BlockHandle& filter_blk_handle = rep_->filter_handle; - const bool is_a_filter_partition = true; - return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io, get_context, prefix_extractor); -} - -BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io, GetContext* get_context, - const SliceTransform* prefix_extractor) const { - // If cache_index_and_filter_blocks is false, filter should be pre-populated. - // We will return rep_->filter anyway. rep_->filter can be nullptr if filter - // read fails at Open() time. We don't want to reload again since it will - // most probably fail again. - if (!is_a_filter_partition && - !rep_->table_options.cache_index_and_filter_blocks) { - return {rep_->filter.get(), nullptr /* cache handle */}; - } - - Cache* block_cache = rep_->table_options.block_cache.get(); - if (rep_->filter_policy == nullptr /* do not use filter */ || - block_cache == nullptr /* no block cache at all */) { - return {nullptr /* filter */, nullptr /* cache handle */}; - } - - if (!is_a_filter_partition && rep_->filter_entry.IsSet()) { - return rep_->filter_entry; - } - - PERF_TIMER_GUARD(read_filter_block_nanos); - - // Fetching from the cache - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - filter_blk_handle, cache_key); - - Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, - BLOCK_CACHE_FILTER_HIT, - get_context ? &get_context->get_context_stats_.num_cache_filter_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_filter_hit - : nullptr, - statistics, get_context); - - FilterBlockReader* filter = nullptr; - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); - filter = - reinterpret_cast(block_cache->Value(cache_handle)); - } else if (no_io) { - // Do not invoke any io. - return CachableEntry(); - } else { - filter = ReadFilter(prefetch_buffer, filter_blk_handle, - is_a_filter_partition, prefix_extractor); - if (filter != nullptr) { - size_t usage = filter->ApproximateMemoryUsage(); - Status s = block_cache->Insert( - key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); - if (s.ok()) { - PERF_COUNTER_ADD(filter_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_filter_add++; - get_context->get_context_stats_.num_cache_filter_bytes_insert += - usage; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); - RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); - } - } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete filter; - return CachableEntry(); - } - } - } - - return {filter, cache_handle}; -} - -BlockBasedTable::CachableEntry -BlockBasedTable::GetUncompressionDict(Rep* rep, - FilePrefetchBuffer* prefetch_buffer, - bool no_io, GetContext* get_context) { - if (!rep->table_options.cache_index_and_filter_blocks) { - // block cache is either disabled or not used for meta-blocks. In either - // case, BlockBasedTableReader is the owner of the uncompression dictionary. - return {rep->uncompression_dict.get(), nullptr /* cache handle */}; - } - if (rep->compression_dict_handle.IsNull()) { - return {nullptr, nullptr}; - } - char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto cache_key = - GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, - rep->compression_dict_handle, cache_key_buf); - auto cache_handle = GetEntryFromCache( - rep->table_options.block_cache.get(), cache_key, rep->level, - BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, - get_context - ? &get_context->get_context_stats_.num_cache_compression_dict_miss - : nullptr, - get_context - ? &get_context->get_context_stats_.num_cache_compression_dict_hit - : nullptr, - rep->ioptions.statistics, get_context); - UncompressionDict* dict = nullptr; - if (cache_handle != nullptr) { - dict = reinterpret_cast( - rep->table_options.block_cache->Value(cache_handle)); - } else if (no_io) { - // Do not invoke any io. - } else { - std::unique_ptr compression_dict_block; - Status s = - ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); - size_t usage = 0; - if (s.ok()) { - assert(compression_dict_block != nullptr); - // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy - dict = new UncompressionDict(compression_dict_block->data.ToString(), - rep->blocks_definitely_zstd_compressed, - rep->ioptions.statistics); - usage = dict->ApproximateMemoryUsage(); - s = rep->table_options.block_cache->Insert( - cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, - &cache_handle, - rep->table_options.cache_index_and_filter_blocks_with_high_priority - ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); - } - if (s.ok()) { - PERF_COUNTER_ADD(compression_dict_block_read_count, 1); - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += usage; - get_context->get_context_stats_.num_cache_compression_dict_add++; - get_context->get_context_stats_ - .num_cache_compression_dict_bytes_insert += usage; - } else { - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD); - RecordTick(rep->ioptions.statistics, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage); - } - } else { - // There should be no way to get here if block cache insertion succeeded. - // Though it is still possible something failed earlier. - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES); - delete dict; - dict = nullptr; - assert(cache_handle == nullptr); - } - } - return {dict, cache_handle}; -} - -// disable_prefix_seek should be set to true when prefix_extractor found in SST -// differs from the one in mutable_cf_options and index type is HashBasedIndex -InternalIteratorBase* BlockBasedTable::NewIndexIterator( - const ReadOptions& read_options, bool disable_prefix_seek, - IndexBlockIter* input_iter, CachableEntry* index_entry, - GetContext* get_context) { - // index reader has already been pre-populated. - if (rep_->index_reader) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return rep_->index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - // we have a pinned index block - if (rep_->index_entry.IsSet()) { - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return rep_->index_entry.value->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek, - read_options.fill_cache); - } - - PERF_TIMER_GUARD(read_index_block_nanos); - - const bool no_io = read_options.read_tier == kBlockCacheTier; - Cache* block_cache = rep_->table_options.block_cache.get(); - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = - GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = GetEntryFromCache( - block_cache, key, rep_->level, BLOCK_CACHE_INDEX_MISS, - BLOCK_CACHE_INDEX_HIT, - get_context ? &get_context->get_context_stats_.num_cache_index_miss - : nullptr, - get_context ? &get_context->get_context_stats_.num_cache_index_hit - : nullptr, - statistics, get_context); - - if (cache_handle == nullptr && no_io) { - if (input_iter != nullptr) { - input_iter->Invalidate(Status::Incomplete("no blocking io")); - return input_iter; - } else { - return NewErrorInternalIterator( - Status::Incomplete("no blocking io")); - } - } - - IndexReader* index_reader = nullptr; - if (cache_handle != nullptr) { - PERF_COUNTER_ADD(block_cache_index_hit_count, 1); - index_reader = - reinterpret_cast(block_cache->Value(cache_handle)); - } else { - // Create index reader and put it in the cache. 
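// Sketch: the branch below (like GetUncompressionDict() and GetFilter()
// above) follows a lookup-or-create pattern: on a cache miss the object is
// built, charged to the cache by its approximate memory usage, and returned.
// MiniCacheSketch is an illustrative stand-in for the block cache.
#include <cstddef>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

struct MiniCacheSketch {
  struct Entry {
    std::shared_ptr<void> obj;
    size_t charge;
  };
  std::unordered_map<std::string, Entry> map;
  size_t total_charge = 0;
};

template <typename T>
std::shared_ptr<T> GetOrCreateSketch(
    MiniCacheSketch& cache, const std::string& key,
    const std::function<std::shared_ptr<T>()>& create,
    const std::function<size_t(const T&)>& charge_of) {
  auto it = cache.map.find(key);
  if (it != cache.map.end()) {
    return std::static_pointer_cast<T>(it->second.obj);  // cache hit
  }
  std::shared_ptr<T> obj = create();                      // cache miss
  if (obj == nullptr) {
    return nullptr;                                       // creation failed
  }
  const size_t charge = charge_of(*obj);
  cache.map[key] = {obj, charge};
  cache.total_charge += charge;
  return obj;
}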
- Status s; - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:2"); - s = CreateIndexReader(nullptr /* prefetch_buffer */, &index_reader); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3"); - TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4"); - size_t charge = 0; - if (s.ok()) { - assert(index_reader != nullptr); - charge = index_reader->ApproximateMemoryUsage(); - s = block_cache->Insert( - key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle, - rep_->table_options.cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW); - } - - if (s.ok()) { - if (get_context != nullptr) { - get_context->get_context_stats_.num_cache_add++; - get_context->get_context_stats_.num_cache_bytes_write += charge; - } else { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); - } - PERF_COUNTER_ADD(index_block_read_count, 1); - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); - } else { - if (index_reader != nullptr) { - delete index_reader; - } - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - // make sure if something goes wrong, index_reader shall remain intact. - if (input_iter != nullptr) { - input_iter->Invalidate(s); - return input_iter; - } else { - return NewErrorInternalIterator(s); - } - } - } - - assert(cache_handle); - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - auto* iter = index_reader->NewIterator( - input_iter, read_options.total_order_seek || disable_prefix_seek); - - // the caller would like to take ownership of the index block - // don't call RegisterCleanup() in this case, the caller will take care of it - if (index_entry != nullptr) { - *index_entry = {index_reader, cache_handle}; - } else { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); - } - - return iter; -} - -// Convert an index iterator value (i.e., an encoded BlockHandle) -// into an iterator over the contents of the corresponding block. -// If input_iter is null, new a iterator -// If input_iter is not null, update this iter and return it -template -TBlockIter* BlockBasedTable::NewDataBlockIterator( - Rep* rep, const ReadOptions& ro, const BlockHandle& handle, - TBlockIter* input_iter, bool is_index, bool key_includes_seq, - bool index_key_is_full, GetContext* get_context, Status s, - FilePrefetchBuffer* prefetch_buffer) { - PERF_TIMER_GUARD(new_table_block_iter_nanos); - - Cache* block_cache = rep->table_options.block_cache.get(); - CachableEntry block; - TBlockIter* iter; - { - const bool no_io = (ro.read_tier == kBlockCacheTier); - auto uncompression_dict_storage = - GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); - const UncompressionDict& uncompression_dict = - uncompression_dict_storage.value == nullptr - ? UncompressionDict::GetEmptyDict() - : *uncompression_dict_storage.value; - if (s.ok()) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, - uncompression_dict, &block, is_index, - get_context); - } - - if (input_iter != nullptr) { - iter = input_iter; - } else { - iter = new TBlockIter; - } - // Didn't get any data from block caches. 
- if (s.ok() && block.value == nullptr) { - if (no_io) { - // Could not read from block_cache and can't do IO - iter->Invalidate(Status::Incomplete("no blocking io")); - return iter; - } - std::unique_ptr block_value; - { - StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, - READ_BLOCK_GET_MICROS); - s = ReadBlockFromFile( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &block_value, rep->ioptions, - rep->blocks_maybe_compressed /*do_decompress*/, - rep->blocks_maybe_compressed, uncompression_dict, - rep->persistent_cache_options, - is_index ? kDisableGlobalSequenceNumber : rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit, - GetMemoryAllocator(rep->table_options)); - } - if (s.ok()) { - block.value = block_value.release(); - } - } - // TODO(ajkr): also pin compression dictionary block when - // `pin_l0_filter_and_index_blocks_in_cache == true`. - uncompression_dict_storage.Release(block_cache); - } - - if (s.ok()) { - assert(block.value != nullptr); - const bool kTotalOrderSeek = true; - // Block contents are pinned and it is still pinned after the iterator - // is destroyed as long as cleanup functions are moved to another object, - // when: - // 1. block cache handle is set to be released in cleanup function, or - // 2. it's pointing to immortal source. If own_bytes is true then we are - // not reading data from the original source, whether immortal or not. - // Otherwise, the block is pinned iff the source is immortal. - bool block_contents_pinned = - (block.cache_handle != nullptr || - (!block.value->own_bytes() && rep->immortal_table)); - iter = block.value->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, - index_key_is_full, block_contents_pinned); - if (block.cache_handle != nullptr) { - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, - block.cache_handle); - } else { - if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { - // insert a dummy record to block cache to track the memory usage - Cache::Handle* cache_handle; - // There are two other types of cache keys: 1) SST cache key added in - // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in - // `write_buffer_manager`. 
Use longer prefix (41 bytes) to differentiate - // from SST cache key(31 bytes), and use non-zero prefix to - // differentiate from `write_buffer_manager` - const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; - char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; - // Prefix: use rep->cache_key_prefix padded by 0s - memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); - assert(rep->cache_key_prefix_size != 0); - assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); - memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); - char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, - next_cache_key_id_++); - assert(end - cache_key <= - static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); - Slice unique_key = - Slice(cache_key, static_cast(end - cache_key)); - s = block_cache->Insert(unique_key, nullptr, - block.value->ApproximateMemoryUsage(), nullptr, - &cache_handle); - if (s.ok()) { - if (cache_handle != nullptr) { - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, - cache_handle); - } - } - } - iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); - } - } else { - assert(block.value == nullptr); - iter->Invalidate(s); - } - return iter; -} - -Status BlockBasedTable::MaybeReadBlockAndLoadToCache( - FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, bool is_index, GetContext* get_context) { - assert(block_entry != nullptr); - const bool no_io = (ro.read_tier == kBlockCacheTier); - Cache* block_cache = rep->table_options.block_cache.get(); - - // No point to cache compressed blocks if it never goes away - Cache* block_cache_compressed = - rep->immortal_table ? nullptr - : rep->table_options.block_cache_compressed.get(); - - // First, try to get the block from the cache - // - // If either block cache is enabled, we'll try to read from it. - Status s; - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice key /* key to the block cache */; - Slice ckey /* key to the compressed block cache */; - if (block_cache != nullptr || block_cache_compressed != nullptr) { - // create key for block cache - if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, - handle, cache_key); - } - - if (block_cache_compressed != nullptr) { - ckey = GetCacheKey(rep->compressed_cache_key_prefix, - rep->compressed_cache_key_prefix_size, handle, - compressed_cache_key); - } - - s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - rep, ro, block_entry, uncompression_dict, - rep->table_options.read_amp_bytes_per_bit, - is_index, get_context); - - // Can't find the block from the cache. If I/O is allowed, read from the - // file. 
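// For reference, the `key` / `ckey` built just above follow a simple scheme:
// the file's cache-key prefix followed by the varint-encoded block offset. A
// rough sketch of that composition (mirroring the GetCacheKey helper; the
// buffer must hold at least prefix_size + kMaxVarint64Length bytes):
inline Slice MakeBlockCacheKeySketch(const char* prefix, size_t prefix_size,
                                     const BlockHandle& handle, char* buf) {
  memcpy(buf, prefix, prefix_size);
  char* end = EncodeVarint64(buf + prefix_size, handle.offset());
  return Slice(buf, static_cast<size_t>(end - buf));
}
// The 41-byte "dummy key" described earlier uses the same idea, but pads the
// prefix with zeros and appends a per-table counter instead of a block offset.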
- if (block_entry->value == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep->ioptions.statistics; - bool do_decompress = - block_cache_compressed == nullptr && rep->blocks_maybe_compressed; - CompressionType raw_block_comp_type; - BlockContents raw_block_contents; - { - StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); - BlockFetcher block_fetcher( - rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &raw_block_contents, rep->ioptions, - do_decompress /* do uncompress */, rep->blocks_maybe_compressed, - uncompression_dict, rep->persistent_cache_options, - GetMemoryAllocator(rep->table_options), - GetMemoryAllocatorForCompressedBlock(rep->table_options)); - s = block_fetcher.ReadBlockContents(); - raw_block_comp_type = block_fetcher.get_compression_type(); - } - - if (s.ok()) { - SequenceNumber seq_no = rep->get_global_seqno(is_index); - // If filling cache is allowed and a cache is configured, try to put the - // block to the cache. - s = PutDataBlockToCache( - key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, - block_entry, &raw_block_contents, raw_block_comp_type, - rep->table_options.format_version, uncompression_dict, seq_no, - rep->table_options.read_amp_bytes_per_bit, - GetMemoryAllocator(rep->table_options), is_index, - is_index && rep->table_options - .cache_index_and_filter_blocks_with_high_priority - ? Cache::Priority::HIGH - : Cache::Priority::LOW, - get_context); - } - } - } - assert(s.ok() || block_entry->value == nullptr); - return s; -} - -BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( - BlockBasedTable* table, - std::unordered_map>* block_map, - bool index_key_includes_seq, bool index_key_is_full) - : table_(table), - block_map_(block_map), - index_key_includes_seq_(index_key_includes_seq), - index_key_is_full_(index_key_is_full) {} - -InternalIteratorBase* -BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( - const BlockHandle& handle) { - // Return a block iterator on the index partition - auto rep = table_->get_rep(); - auto block = block_map_->find(handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (block != block_map_->end()) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); - Cache* block_cache = rep->table_options.block_cache.get(); - assert(block_cache); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(block->second.cache_handle)); - Statistics* kNullStats = nullptr; - // We don't return pinned datat from index blocks, so no need - // to set `block_contents_pinned`. - return block->second.value->NewIterator( - &rep->internal_comparator, rep->internal_comparator.user_comparator(), - nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); - } - // Create an empty iterator - return new IndexBlockIter(); -} - -// This will be broken if the user specifies an unusual implementation -// of Options.comparator, or if the user specifies an unusual -// definition of prefixes in BlockBasedTableOptions.filter_policy. -// In particular, we require the following three properties: -// -// 1) key.starts_with(prefix(key)) -// 2) Compare(prefix(key), key) <= 0. -// 3) If Compare(key1, key2) <= 0, then Compare(prefix(key1), prefix(key2)) <= 0 -// -// Otherwise, this method guarantees no I/O will be incurred. 
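// As a concrete example, all three properties hold for a fixed- or capped-
// length prefix extractor combined with the default bytewise comparator: a
// prefix is, byte for byte, the beginning of its key, a prefix never compares
// greater than the full key, and truncating two keys to their first k bytes
// preserves their bytewise ordering. A minimal setup that satisfies the
// contract (illustrative only):
//
//   Options options;
//   options.comparator = BytewiseComparator();  // the default
//   options.prefix_extractor.reset(NewCappedPrefixTransform(4));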
-// -// REQUIRES: this method shouldn't be called while the DB lock is held. -bool BlockBasedTable::PrefixMayMatch( - const Slice& internal_key, const ReadOptions& read_options, - const SliceTransform* options_prefix_extractor, - const bool need_upper_bound_check) { - if (!rep_->filter_policy) { - return true; - } - - const SliceTransform* prefix_extractor; - - if (rep_->table_prefix_extractor == nullptr) { - if (need_upper_bound_check) { - return true; - } - prefix_extractor = options_prefix_extractor; - } else { - prefix_extractor = rep_->table_prefix_extractor.get(); - } - auto user_key = ExtractUserKey(internal_key); - if (!prefix_extractor->InDomain(user_key)) { - return true; - } - - bool may_match = true; - Status s; - - // First, try check with full filter - auto filter_entry = GetFilter(prefix_extractor); - FilterBlockReader* filter = filter_entry.value; - bool filter_checked = true; - if (filter != nullptr) { - if (!filter->IsBlockBased()) { - const Slice* const const_ikey_ptr = &internal_key; - may_match = filter->RangeMayExist( - read_options.iterate_upper_bound, user_key, prefix_extractor, - rep_->internal_comparator.user_comparator(), const_ikey_ptr, - &filter_checked, need_upper_bound_check); - } else { - // if prefix_extractor changed for block based filter, skip filter - if (need_upper_bound_check) { - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } - return true; - } - auto prefix = prefix_extractor->Transform(user_key); - InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); - auto internal_prefix = internal_key_prefix.Encode(); - - // To prevent any io operation in this method, we set `read_tier` to make - // sure we always read index or filter only when they have already been - // loaded to memory. - ReadOptions no_io_read_options; - no_io_read_options.read_tier = kBlockCacheTier; - - // Then, try find it within each block - // we already know prefix_extractor and prefix_extractor_name must match - // because `CheckPrefixMayMatch` first checks `check_filter_ == true` - std::unique_ptr> iiter( - NewIndexIterator(no_io_read_options, - /* need_upper_bound_check */ false)); - iiter->Seek(internal_prefix); - - if (!iiter->Valid()) { - // we're past end of file - // if it's incomplete, it means that we avoided I/O - // and we're not really sure that we're past the end - // of the file - may_match = iiter->status().IsIncomplete(); - } else if ((rep_->table_properties && - rep_->table_properties->index_key_is_user_key - ? iiter->key() - : ExtractUserKey(iiter->key())) - .starts_with(ExtractUserKey(internal_prefix))) { - // we need to check for this subtle case because our only - // guarantee is that "the key is a string >= last key in that data - // block" according to the doc/table_format.txt spec. - // - // Suppose iiter->key() starts with the desired prefix; it is not - // necessarily the case that the corresponding data block will - // contain the prefix, since iiter->key() need not be in the - // block. However, the next data block may contain the prefix, so - // we return true to play it safe. - may_match = true; - } else if (filter->IsBlockBased()) { - // iiter->key() does NOT start with the desired prefix. Because - // Seek() finds the first key that is >= the seek target, this - // means that iiter->key() > prefix. Thus, any data blocks coming - // after the data block corresponding to iiter->key() cannot - // possibly contain the key. 
Thus, the corresponding data block - // is the only on could potentially contain the prefix. - BlockHandle handle = iiter->value(); - may_match = - filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); - } - } - } - - if (filter_checked) { - Statistics* statistics = rep_->ioptions.statistics; - RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); - if (!may_match) { - RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); - } - } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } - return may_match; -} - -template -void BlockBasedTableIterator::Seek(const Slice& target) { - is_out_of_bound_ = false; - if (!CheckPrefixMayMatch(target)) { - ResetDataIter(); - return; - } - - SavePrevIndexValue(); - - index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - - InitDataBlock(); - - block_iter_.Seek(target); - - FindKeyForward(); - CheckOutOfBound(); - assert( - !block_iter_.Valid() || - (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || - (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target), - block_iter_.key()) <= 0)); -} - -template -void BlockBasedTableIterator::SeekForPrev( - const Slice& target) { - is_out_of_bound_ = false; - if (!CheckPrefixMayMatch(target)) { - ResetDataIter(); - return; - } - - SavePrevIndexValue(); - - // Call Seek() rather than SeekForPrev() in the index block, because the - // target data block will likely to contain the position for `target`, the - // same as Seek(), rather than than before. - // For example, if we have three data blocks, each containing two keys: - // [2, 4] [6, 8] [10, 12] - // (the keys in the index block would be [4, 8, 12]) - // and the user calls SeekForPrev(7), we need to go to the second block, - // just like if they call Seek(7). - // The only case where the block is difference is when they seek to a position - // in the boundary. For example, if they SeekForPrev(5), we should go to the - // first block, rather than the second. However, we don't have the information - // to distinguish the two unless we read the second block. In this case, we'll - // end up with reading two blocks. 
- index_iter_->Seek(target); - - if (!index_iter_->Valid()) { - index_iter_->SeekToLast(); - if (!index_iter_->Valid()) { - ResetDataIter(); - block_iter_points_to_real_block_ = false; - return; - } - } - - InitDataBlock(); - - block_iter_.SeekForPrev(target); - - FindKeyBackward(); - assert(!block_iter_.Valid() || - icomp_.Compare(target, block_iter_.key()) >= 0); -} - -template -void BlockBasedTableIterator::SeekToFirst() { - is_out_of_bound_ = false; - SavePrevIndexValue(); - index_iter_->SeekToFirst(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToFirst(); - FindKeyForward(); - CheckOutOfBound(); -} - -template -void BlockBasedTableIterator::SeekToLast() { - is_out_of_bound_ = false; - SavePrevIndexValue(); - index_iter_->SeekToLast(); - if (!index_iter_->Valid()) { - ResetDataIter(); - return; - } - InitDataBlock(); - block_iter_.SeekToLast(); - FindKeyBackward(); -} - -template -void BlockBasedTableIterator::Next() { - assert(block_iter_points_to_real_block_); - block_iter_.Next(); - FindKeyForward(); -} - -template -bool BlockBasedTableIterator::NextAndGetResult( - Slice* ret_key) { - Next(); - bool is_valid = Valid(); - if (is_valid) { - *ret_key = key(); - } - return is_valid; -} - -template -void BlockBasedTableIterator::Prev() { - assert(block_iter_points_to_real_block_); - block_iter_.Prev(); - FindKeyBackward(); -} - -// Found that 256 KB readahead size provides the best performance, based on -// experiments, for auto readahead. Experiment data is in PR #3282. -template -const size_t - BlockBasedTableIterator::kMaxAutoReadaheadSize = - 256 * 1024; - -template -void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value(); - if (!block_iter_points_to_real_block_ || - data_block_handle.offset() != prev_index_value_.offset() || - // if previous attempt of reading the block missed cache, try again - block_iter_.status().IsIncomplete()) { - if (block_iter_points_to_real_block_) { - ResetDataIter(); - } - auto* rep = table_->get_rep(); - - // Prefetch additional data for range scans (iterators). Enabled only for - // user reads. - // Implicit auto readahead: - // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. - // Explicit user requested readahead: - // Enabled from the very first IO when ReadOptions.readahead_size is set. - if (!for_compaction_) { - if (read_options_.readahead_size == 0) { - // Implicit auto readahead - num_file_reads_++; - if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) { - if (!rep->file->use_direct_io() && - (data_block_handle.offset() + - static_cast(data_block_handle.size()) + - kBlockTrailerSize > - readahead_limit_)) { - // Buffered I/O - // Discarding the return status of Prefetch calls intentionally, as - // we can fallback to reading from disk if Prefetch fails. - rep->file->Prefetch(data_block_handle.offset(), readahead_size_); - readahead_limit_ = static_cast(data_block_handle.offset() + - readahead_size_); - // Keep exponentially increasing readahead size until - // kMaxAutoReadaheadSize. - readahead_size_ = - std::min(kMaxAutoReadaheadSize, readahead_size_ * 2); - } else if (rep->file->use_direct_io() && !prefetch_buffer_) { - // Direct I/O - // Let FilePrefetchBuffer take care of the readahead. 
- prefetch_buffer_.reset( - new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize, - kMaxAutoReadaheadSize)); - } - } - } else if (!prefetch_buffer_) { - // Explicit user requested readahead - // The actual condition is: - // if (read_options_.readahead_size != 0 && !prefetch_buffer_) - prefetch_buffer_.reset(new FilePrefetchBuffer( - rep->file.get(), read_options_.readahead_size, - read_options_.readahead_size)); - } - } - - Status s; - BlockBasedTable::NewDataBlockIterator( - rep, read_options_, data_block_handle, &block_iter_, is_index_, - key_includes_seq_, index_key_is_full_, - /* get_context */ nullptr, s, prefetch_buffer_.get()); - block_iter_points_to_real_block_ = true; - } -} - -template -void BlockBasedTableIterator::FindBlockForward() { - // TODO the while loop inherits from two-level-iterator. We don't know - // whether a block can be empty so it can be replaced by an "if". - do { - if (!block_iter_.status().ok()) { - return; - } - // Whether next data block is out of upper bound, if there is one. - bool next_block_is_out_of_bound = false; - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_) { - next_block_is_out_of_bound = - (user_comparator_.Compare(*read_options_.iterate_upper_bound, - index_iter_->user_key()) <= 0); - } - ResetDataIter(); - index_iter_->Next(); - if (next_block_is_out_of_bound) { - // The next block is out of bound. No need to read it. - TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr); - // We need to make sure this is not the last data block before setting - // is_out_of_bound_, since the index key for the last data block can be - // larger than smallest key of the next file on the same level. - if (index_iter_->Valid()) { - is_out_of_bound_ = true; - } - return; - } - - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToFirst(); - } else { - return; - } - } while (!block_iter_.Valid()); -} - -template -void BlockBasedTableIterator::FindKeyForward() { - assert(!is_out_of_bound_); - - if (!block_iter_.Valid()) { - FindBlockForward(); - } -} - -template -void BlockBasedTableIterator::FindKeyBackward() { - while (!block_iter_.Valid()) { - if (!block_iter_.status().ok()) { - return; - } - - ResetDataIter(); - index_iter_->Prev(); - - if (index_iter_->Valid()) { - InitDataBlock(); - block_iter_.SeekToLast(); - } else { - return; - } - } - - // We could have check lower bound here too, but we opt not to do it for - // code simplicity. 
-} - -template -void BlockBasedTableIterator::CheckOutOfBound() { - if (read_options_.iterate_upper_bound != nullptr && - block_iter_points_to_real_block_ && block_iter_.Valid()) { - is_out_of_bound_ = user_comparator_.Compare( - *read_options_.iterate_upper_bound, user_key()) <= 0; - } -} - -InternalIterator* BlockBasedTable::NewIterator( - const ReadOptions& read_options, const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, bool for_compaction) { - bool need_upper_bound_check = - PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); - const bool kIsNotIndex = false; - if (arena == nullptr) { - return new BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator( - read_options, - need_upper_bound_check && - rep_->index_type == BlockBasedTableOptions::kHashSearch), - !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); - } else { - auto* mem = - arena->AllocateAligned(sizeof(BlockBasedTableIterator)); - return new (mem) BlockBasedTableIterator( - this, read_options, rep_->internal_comparator, - NewIndexIterator(read_options, need_upper_bound_check), - !skip_filters && !read_options.total_order_seek && - prefix_extractor != nullptr, - need_upper_bound_check, prefix_extractor, kIsNotIndex, - true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); - } -} - -FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( - const ReadOptions& read_options) { - if (rep_->fragmented_range_dels == nullptr) { - return nullptr; - } - SequenceNumber snapshot = kMaxSequenceNumber; - if (read_options.snapshot != nullptr) { - snapshot = read_options.snapshot->GetSequenceNumber(); - } - return new FragmentedRangeTombstoneIterator( - rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); -} - -bool BlockBasedTable::FullFilterKeyMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - const Slice& internal_key, const bool no_io, - const SliceTransform* prefix_extractor) const { - if (filter == nullptr || filter->IsBlockBased()) { - return true; - } - Slice user_key = ExtractUserKey(internal_key); - const Slice* const const_ikey_ptr = &internal_key; - bool may_match = true; - if (filter->whole_key_filtering()) { - may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid, - no_io, const_ikey_ptr); - } else if (!read_options.total_order_seek && prefix_extractor && - rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0 && - prefix_extractor->InDomain(user_key) && - !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, false, - const_ikey_ptr)) { - may_match = false; - } - if (may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); - } - return may_match; -} - -void BlockBasedTable::FullFilterKeysMayMatch( - const ReadOptions& read_options, FilterBlockReader* filter, - MultiGetRange* range, const bool no_io, - const SliceTransform* prefix_extractor) const { - if (filter == nullptr || filter->IsBlockBased()) { - return; - } - if (filter->whole_key_filtering()) { - filter->KeysMayMatch(range, prefix_extractor, kNotValid, no_io); - } else if (!read_options.total_order_seek && prefix_extractor && - 
rep_->table_properties->prefix_extractor_name.compare( - prefix_extractor->Name()) == 0) { - for (auto iter = range->begin(); iter != range->end(); ++iter) { - Slice user_key = iter->lkey->user_key(); - - if (!prefix_extractor->InDomain(user_key)) { - range->SkipKey(iter); - } - } - filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false); - } -} - -Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, - GetContext* get_context, - const SliceTransform* prefix_extractor, - bool skip_filters) { - assert(key.size() >= 8); // key must be internal key - Status s; - const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry filter_entry; - bool may_match; - FilterBlockReader* filter = nullptr; - { - if (!skip_filters) { - filter_entry = - GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, - read_options.read_tier == kBlockCacheTier, get_context); - } - filter = filter_entry.value; - - // First check the full filter - // If full filter not useful, Then go into each block - may_match = FullFilterKeyMayMatch(read_options, filter, key, no_io, - prefix_extractor); - } - if (!may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); - } else { - IndexBlockIter iiter_on_stack; - // if prefix_extractor found in block differs from options, disable - // BlockPrefixIndex. Only do this check when index_type is kHashSearch. - bool need_upper_bound_check = false; - if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); - } - auto iiter = - NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, get_context); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr.reset(iiter); - } - - bool matched = false; // if such user key mathced a key in SST - bool done = false; - for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - BlockHandle handle = iiter->value(); - - bool not_exist_in_filter = - filter != nullptr && filter->IsBlockBased() == true && - !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, - handle.offset(), no_io); - - if (not_exist_in_filter) { - // Not found - // TODO: think about interaction with Merge. If a user key cannot - // cross one data block, we should be fine. - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); - break; - } else { - DataBlockIter biter; - NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, - true /* key_includes_seq */, true /* index_key_is_full */, - get_context); - - if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for - // whether we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } - - bool may_exist = biter.SeekForGet(key); - if (!may_exist) { - // HashSeek cannot find the key this block and the the iter is not - // the end of the block, i.e. cannot be in the following blocks - // either. In this case, the seek_key cannot be found, so we break - // from the top level for-loop. 
- break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? &biter : nullptr)) { - done = true; - break; - } - } - s = biter.status(); - } - if (done) { - // Avoid the extra Next which is expensive in two-level indexes - break; - } - } - if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, - rep_->level); - } - if (s.ok()) { - s = iiter->status(); - } - } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } - return s; -} - -using MultiGetRange = MultiGetContext::Range; -void BlockBasedTable::MultiGet(const ReadOptions& read_options, - const MultiGetRange* mget_range, - const SliceTransform* prefix_extractor, - bool skip_filters) { - const bool no_io = read_options.read_tier == kBlockCacheTier; - CachableEntry filter_entry; - FilterBlockReader* filter = nullptr; - MultiGetRange sst_file_range(*mget_range, mget_range->begin(), - mget_range->end()); - { - if (!skip_filters) { - // TODO: Figure out where the stats should go - filter_entry = GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, - read_options.read_tier == kBlockCacheTier, - nullptr /*get_context*/); - } - filter = filter_entry.value; - - // First check the full filter - // If full filter not useful, Then go into each block - FullFilterKeysMayMatch(read_options, filter, &sst_file_range, no_io, - prefix_extractor); - } - if (skip_filters || !sst_file_range.empty()) { - IndexBlockIter iiter_on_stack; - // if prefix_extractor found in block differs from options, disable - // BlockPrefixIndex. Only do this check when index_type is kHashSearch. 
- bool need_upper_bound_check = false; - if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { - need_upper_bound_check = PrefixExtractorChanged( - rep_->table_properties.get(), prefix_extractor); - } - auto iiter = NewIndexIterator( - read_options, need_upper_bound_check, &iiter_on_stack, - /* index_entry */ nullptr, sst_file_range.begin()->get_context); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr.reset(iiter); - } - - for (auto miter = sst_file_range.begin(); miter != sst_file_range.end(); - ++miter) { - Status s; - GetContext* get_context = miter->get_context; - const Slice& key = miter->ikey; - bool matched = false; // if such user key matched a key in SST - bool done = false; - for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - DataBlockIter biter; - NewDataBlockIterator( - rep_, read_options, iiter->value(), &biter, false, - true /* key_includes_seq */, get_context); - - if (read_options.read_tier == kBlockCacheTier && - biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for - // whether we can guarantee the key is not there when "no_io" is set - get_context->MarkKeyMayExist(); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } - - bool may_exist = biter.SeekForGet(key); - if (!may_exist) { - // HashSeek cannot find the key this block and the the iter is not - // the end of the block, i.e. cannot be in the following blocks - // either. In this case, the seek_key cannot be found, so we break - // from the top level for-loop. - break; - } - - // Call the *saver function on each entry/block until it returns false - for (; biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } - - if (!get_context->SaveValue( - parsed_key, biter.value(), &matched, - biter.IsValuePinned() ? 
&biter : nullptr)) { - done = true; - break; - } - } - s = biter.status(); - if (done) { - // Avoid the extra Next which is expensive in two-level indexes - break; - } - } - if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, - rep_->level); - } - if (s.ok()) { - s = iiter->status(); - } - *(miter->s) = s; - } - } - - // if rep_->filter_entry is not set, we should call Release(); otherwise - // don't call, in this case we have a local copy in rep_->filter_entry, - // it's pinned to the cache and will be released in the destructor - if (!rep_->filter_entry.IsSet()) { - filter_entry.Release(rep_->table_options.block_cache.get()); - } -} - -Status BlockBasedTable::Prefetch(const Slice* const begin, - const Slice* const end) { - auto& comparator = rep_->internal_comparator; - auto user_comparator = comparator.user_comparator(); - // pre-condition - if (begin && end && comparator.Compare(*begin, *end) > 0) { - return Status::InvalidArgument(*begin, *end); - } - - IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); - } - - if (!iiter->status().ok()) { - // error opening index iterator - return iiter->status(); - } - - // indicates if we are on the last page that need to be pre-fetched - bool prefetching_boundary_page = false; - - for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); - iiter->Next()) { - BlockHandle block_handle = iiter->value(); - const bool is_user_key = rep_->table_properties && - rep_->table_properties->index_key_is_user_key > 0; - if (end && - ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || - (is_user_key && - user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { - if (prefetching_boundary_page) { - break; - } - - // The index entry represents the last key in the data block. 
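// (For example, with data blocks [k1..k3] and [k4..k6] and index entries "k3"
//  and "k6", a Prefetch(begin, end) call whose *end is "k5" stops at the index
//  entry "k6" >= "k5"; that block may still hold keys such as "k4" that
//  precede "k5", which is exactly the boundary page handled below.)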
- // We should load this page into memory as well, but no more - prefetching_boundary_page = true; - } - - // Load the block specified by the block_handle into the block cache - DataBlockIter biter; - NewDataBlockIterator(rep_, ReadOptions(), block_handle, - &biter); - - if (!biter.status().ok()) { - // there was an unexpected error while pre-fetching - return biter.status(); - } - } - - return Status::OK(); -} - -Status BlockBasedTable::VerifyChecksum() { - Status s; - // Check Meta blocks - std::unique_ptr meta; - std::unique_ptr meta_iter; - s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter); - if (s.ok()) { - s = VerifyChecksumInMetaBlocks(meta_iter.get()); - if (!s.ok()) { - return s; - } - } else { - return s; - } - // Check Data blocks - IndexBlockIter iiter_on_stack; - InternalIteratorBase* iiter = - NewIndexIterator(ReadOptions(), false, &iiter_on_stack); - std::unique_ptr> iiter_unique_ptr; - if (iiter != &iiter_on_stack) { - iiter_unique_ptr = - std::unique_ptr>(iiter); - } - if (!iiter->status().ok()) { - // error opening index iterator - return iiter->status(); - } - s = VerifyChecksumInBlocks(iiter); - return s; -} - -Status BlockBasedTable::VerifyChecksumInBlocks( - InternalIteratorBase* index_iter) { - Status s; - for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { - s = index_iter->status(); - if (!s.ok()) { - break; - } - BlockHandle handle = index_iter->value(); - BlockContents contents; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - break; - } - } - return s; -} - -Status BlockBasedTable::VerifyChecksumInMetaBlocks( - InternalIteratorBase* index_iter) { - Status s; - for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { - s = index_iter->status(); - if (!s.ok()) { - break; - } - BlockHandle handle; - Slice input = index_iter->value(); - s = handle.DecodeFrom(&input); - BlockContents contents; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { - TableProperties* table_properties; - s = TryReadPropertiesWithGlobalSeqno(rep_, nullptr /* prefetch_buffer */, - index_iter->value(), - &table_properties); - delete table_properties; - } - if (!s.ok()) { - break; - } - } - return s; -} - -bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, - const Slice& key) { - std::unique_ptr> iiter( - NewIndexIterator(options)); - iiter->Seek(key); - assert(iiter->Valid()); - CachableEntry block; - - BlockHandle handle = iiter->value(); - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(block_cache != nullptr); - - char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); - Slice ckey; - - Status s; - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, - 
&compression_dict_block); - if (s.ok()) { - assert(compression_dict_block != nullptr); - UncompressionDict uncompression_dict( - compression_dict_block->data.ToString(), - rep_->blocks_definitely_zstd_compressed); - s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, rep_, - options, &block, uncompression_dict, - 0 /* read_amp_bytes_per_bit */); - } - } else { - s = GetDataBlockFromCache( - cache_key, ckey, block_cache, nullptr, rep_, options, &block, - UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); - } - assert(s.ok()); - bool in_cache = block.value != nullptr; - if (in_cache) { - ReleaseCachedEntry(block_cache, block.cache_handle); - } - return in_cache; -} - -BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { - // Some old version of block-based tables don't have index type present in - // table properties. If that's the case we can safely use the kBinarySearch. - BlockBasedTableOptions::IndexType index_type_on_file = - BlockBasedTableOptions::kBinarySearch; - if (rep_->table_properties) { - auto& props = rep_->table_properties->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { - index_type_on_file = static_cast( - DecodeFixed32(pos->second.c_str())); - // update index_type with the true type - rep_->index_type = index_type_on_file; - } - } - return index_type_on_file; -} - -// REQUIRES: The following fields of rep_ should have already been populated: -// 1. file -// 2. index_handle, -// 3. options -// 4. internal_comparator -// 5. index_type -Status BlockBasedTable::CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, - InternalIterator* preloaded_meta_index_iter, int level) { - auto index_type_on_file = UpdateIndexType(); - - auto file = rep_->file.get(); - const InternalKeyComparator* icomparator = &rep_->internal_comparator; - const Footer& footer = rep_->footer; - - // kHashSearch requires non-empty prefix_extractor but bypass checking - // prefix_extractor here since we have no access to MutableCFOptions. - // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. 
- // If prefix_extractor does not match prefix_extractor_name from table - // properties, turn off Hash Index by setting total_order_seek to true - - switch (index_type_on_file) { - case BlockBasedTableOptions::kTwoLevelIndexSearch: { - return PartitionIndexReader::Create( - this, file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, level, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - case BlockBasedTableOptions::kBinarySearch: { - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, - icomparator, index_reader, rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - case BlockBasedTableOptions::kHashSearch: { - std::unique_ptr meta_guard; - std::unique_ptr meta_iter_guard; - auto meta_index_iter = preloaded_meta_index_iter; - if (meta_index_iter == nullptr) { - auto s = - ReadMetaBlock(rep_, prefetch_buffer, &meta_guard, &meta_iter_guard); - if (!s.ok()) { - // we simply fall back to binary search in case there is any - // problem with prefix hash index loading. - ROCKS_LOG_WARN(rep_->ioptions.info_log, - "Unable to read the metaindex block." - " Fall back to binary search index."); - return BinarySearchIndexReader::Create( - file, prefetch_buffer, footer, footer.index_handle(), - rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - meta_index_iter = meta_iter_guard.get(); - } - - return HashIndexReader::Create( - rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, - rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, - index_reader, rep_->hash_index_allow_collision, - rep_->persistent_cache_options, - rep_->table_properties == nullptr || - rep_->table_properties->index_key_is_user_key == 0, - rep_->table_properties == nullptr || - rep_->table_properties->index_value_is_delta_encoded == 0, - GetMemoryAllocator(rep_->table_options)); - } - default: { - std::string error_message = - "Unrecognized index type: " + ToString(index_type_on_file); - return Status::InvalidArgument(error_message.c_str()); - } - } -} - -uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - std::unique_ptr> index_iter( - NewIndexIterator(ReadOptions())); - - index_iter->Seek(key); - uint64_t result; - if (index_iter->Valid()) { - BlockHandle handle = index_iter->value(); - result = handle.offset(); - } else { - // key is past the last key in the file. If table_properties is not - // available, approximate the offset by returning the offset of the - // metaindex block (which is right near the end of the file). - result = 0; - if (rep_->table_properties) { - result = rep_->table_properties->data_size; - } - // table_properties is not present in the table. 
- if (result == 0) { - result = rep_->footer.metaindex_handle().offset(); - } - } - return result; -} - -bool BlockBasedTable::TEST_filter_block_preloaded() const { - return rep_->filter != nullptr; -} - -bool BlockBasedTable::TEST_index_reader_preloaded() const { - return rep_->index_reader != nullptr; -} - -Status BlockBasedTable::GetKVPairsFromDataBlocks( - std::vector* kv_pair_blocks) { - std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions())); - - Status s = blockhandles_iter->status(); - if (!s.ok()) { - // Cannot read Index Block - return s; - } - - for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); - blockhandles_iter->Next()) { - s = blockhandles_iter->status(); - - if (!s.ok()) { - break; - } - - std::unique_ptr datablock_iter; - datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); - s = datablock_iter->status(); - - if (!s.ok()) { - // Error reading the block - Skipped - continue; - } - - KVPairBlock kv_pair_block; - for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); - datablock_iter->Next()) { - s = datablock_iter->status(); - if (!s.ok()) { - // Error reading the block - Skipped - break; - } - const Slice& key = datablock_iter->key(); - const Slice& value = datablock_iter->value(); - std::string key_copy = std::string(key.data(), key.size()); - std::string value_copy = std::string(value.data(), value.size()); - - kv_pair_block.push_back( - std::make_pair(std::move(key_copy), std::move(value_copy))); - } - kv_pair_blocks->push_back(std::move(kv_pair_block)); - } - return Status::OK(); -} - -Status BlockBasedTable::DumpTable(WritableFile* out_file, - const SliceTransform* prefix_extractor) { - // Output Footer - out_file->Append( - "Footer Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->footer.ToString().c_str()); - out_file->Append("\n"); - - // Output MetaIndex - out_file->Append( - "Metaindex Details:\n" - "--------------------------------------\n"); - std::unique_ptr meta; - std::unique_ptr meta_iter; - Status s = - ReadMetaBlock(rep_, nullptr /* prefetch_buffer */, &meta, &meta_iter); - if (s.ok()) { - for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { - s = meta_iter->status(); - if (!s.ok()) { - return s; - } - if (meta_iter->key() == rocksdb::kPropertiesBlock) { - out_file->Append(" Properties block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (meta_iter->key() == rocksdb::kCompressionDictBlock) { - out_file->Append(" Compression dictionary block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (strstr(meta_iter->key().ToString().c_str(), - "filter.rocksdb.") != nullptr) { - out_file->Append(" Filter block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } else if (meta_iter->key() == rocksdb::kRangeDelBlock) { - out_file->Append(" Range deletion block handle: "); - out_file->Append(meta_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - } - } - out_file->Append("\n"); - } else { - return s; - } - - // Output TableProperties - const rocksdb::TableProperties* table_properties; - table_properties = rep_->table_properties.get(); - - if (table_properties != nullptr) { - out_file->Append( - "Table Properties:\n" - "--------------------------------------\n" - " "); - out_file->Append(table_properties->ToString("\n ", ": ").c_str()); - 
out_file->Append("\n"); - - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, - ReadOptions(), handle, &block, rep_->ioptions, - false /*decompress*/, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), - rep_->persistent_cache_options); - s = block_fetcher.ReadBlockContents(); - if (!s.ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); - } - } - } - } - } - if (rep_->filter) { - out_file->Append( - "Filter Details:\n" - "--------------------------------------\n" - " "); - out_file->Append(rep_->filter->ToString().c_str()); - out_file->Append("\n"); - } - - // Output Index block - s = DumpIndexBlock(out_file); - if (!s.ok()) { - return s; - } - - // Output compression dictionary - if (!rep_->compression_dict_handle.IsNull()) { - std::unique_ptr compression_dict_block; - s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, - &compression_dict_block); - if (!s.ok()) { - return s; - } - assert(compression_dict_block != nullptr); - auto compression_dict = compression_dict_block->data; - out_file->Append( - "Compression Dictionary:\n" - "--------------------------------------\n"); - out_file->Append(" size (bytes): "); - out_file->Append(rocksdb::ToString(compression_dict.size())); - out_file->Append("\n\n"); - out_file->Append(" HEX "); - out_file->Append(compression_dict.ToString(true).c_str()); - out_file->Append("\n\n"); - } - - // Output range deletions block - auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); - if (range_del_iter != nullptr) { - range_del_iter->SeekToFirst(); - if (range_del_iter->Valid()) { - out_file->Append( - "Range deletions:\n" - "--------------------------------------\n" - " "); - for (; range_del_iter->Valid(); range_del_iter->Next()) { - DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_file); - } - out_file->Append("\n"); - } - delete range_del_iter; - } - // Output Data blocks - s = DumpDataBlocks(out_file); - - return s; -} - -void BlockBasedTable::Close() { - if (rep_->closed) { - return; - } - - Cache* const cache = rep_->table_options.block_cache.get(); - - rep_->filter_entry.Release(cache); - rep_->index_entry.Release(cache); - - // cleanup index, filter, and compression dictionary blocks - // to avoid accessing dangling pointers - if (!rep_->table_options.no_block_cache) { - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - - // Get the filter block key - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->filter_handle, cache_key); - cache->Erase(key); - - // Get the index block key - key = GetCacheKeyFromOffset(rep_->cache_key_prefix, - rep_->cache_key_prefix_size, - rep_->dummy_index_reader_offset, cache_key); - cache->Erase(key); - - if (!rep_->compression_dict_handle.IsNull()) { - // Get the 
compression dictionary block key - key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->compression_dict_handle, cache_key); - cache->Erase(key); - } - } - - rep_->closed = true; -} - -Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { - out_file->Append( - "Index Details:\n" - "--------------------------------------\n"); - std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions())); - Status s = blockhandles_iter->status(); - if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); - return s; - } - - out_file->Append(" Block key hex dump: Data block handle\n"); - out_file->Append(" Block key ascii\n\n"); - for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); - blockhandles_iter->Next()) { - s = blockhandles_iter->status(); - if (!s.ok()) { - break; - } - Slice key = blockhandles_iter->key(); - Slice user_key; - InternalKey ikey; - if (rep_->table_properties && - rep_->table_properties->index_key_is_user_key != 0) { - user_key = key; - } else { - ikey.DecodeFrom(key); - user_key = ikey.user_key(); - } - - out_file->Append(" HEX "); - out_file->Append(user_key.ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - - std::string str_key = user_key.ToString(); - std::string res_key(""); - char cspace = ' '; - for (size_t i = 0; i < str_key.size(); i++) { - res_key.append(&str_key[i], 1); - res_key.append(1, cspace); - } - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append("\n ------\n"); - } - out_file->Append("\n"); - return Status::OK(); -} - -Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions())); - Status s = blockhandles_iter->status(); - if (!s.ok()) { - out_file->Append("Can not read Index Block \n\n"); - return s; - } - - uint64_t datablock_size_min = std::numeric_limits::max(); - uint64_t datablock_size_max = 0; - uint64_t datablock_size_sum = 0; - - size_t block_id = 1; - for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); - block_id++, blockhandles_iter->Next()) { - s = blockhandles_iter->status(); - if (!s.ok()) { - break; - } - - BlockHandle bh = blockhandles_iter->value(); - uint64_t datablock_size = bh.size(); - datablock_size_min = std::min(datablock_size_min, datablock_size); - datablock_size_max = std::max(datablock_size_max, datablock_size); - datablock_size_sum += datablock_size; - - out_file->Append("Data Block # "); - out_file->Append(rocksdb::ToString(block_id)); - out_file->Append(" @ "); - out_file->Append(blockhandles_iter->value().ToString(true).c_str()); - out_file->Append("\n"); - out_file->Append("--------------------------------------\n"); - - std::unique_ptr datablock_iter; - datablock_iter.reset(NewDataBlockIterator( - rep_, ReadOptions(), blockhandles_iter->value())); - s = datablock_iter->status(); - - if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n\n"); - continue; - } - - for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); - datablock_iter->Next()) { - s = datablock_iter->status(); - if (!s.ok()) { - out_file->Append("Error reading the block - Skipped \n"); - break; - } - DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_file); - } - out_file->Append("\n"); - } - - uint64_t num_datablocks = block_id - 1; - if (num_datablocks) { - double datablock_size_avg = - static_cast(datablock_size_sum) / num_datablocks; - 
out_file->Append("Data Block Summary:\n"); - out_file->Append("--------------------------------------"); - out_file->Append("\n # data blocks: "); - out_file->Append(rocksdb::ToString(num_datablocks)); - out_file->Append("\n min data block size: "); - out_file->Append(rocksdb::ToString(datablock_size_min)); - out_file->Append("\n max data block size: "); - out_file->Append(rocksdb::ToString(datablock_size_max)); - out_file->Append("\n avg data block size: "); - out_file->Append(rocksdb::ToString(datablock_size_avg)); - out_file->Append("\n"); - } - - return Status::OK(); -} - -void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, - WritableFile* out_file) { - InternalKey ikey; - ikey.DecodeFrom(key); - - out_file->Append(" HEX "); - out_file->Append(ikey.user_key().ToString(true).c_str()); - out_file->Append(": "); - out_file->Append(value.ToString(true).c_str()); - out_file->Append("\n"); - - std::string str_key = ikey.user_key().ToString(); - std::string str_value = value.ToString(); - std::string res_key(""), res_value(""); - char cspace = ' '; - for (size_t i = 0; i < str_key.size(); i++) { - if (str_key[i] == '\0') { - res_key.append("\\0", 2); - } else { - res_key.append(&str_key[i], 1); - } - res_key.append(1, cspace); - } - for (size_t i = 0; i < str_value.size(); i++) { - if (str_value[i] == '\0') { - res_value.append("\\0", 2); - } else { - res_value.append(&str_value[i], 1); - } - res_value.append(1, cspace); - } - - out_file->Append(" ASCII "); - out_file->Append(res_key.c_str()); - out_file->Append(": "); - out_file->Append(res_value.c_str()); - out_file->Append("\n ------\n"); -} - -namespace { - -void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { - FilterBlockReader* filter = reinterpret_cast(value); - if (filter->statistics() != nullptr) { - RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, - filter->ApproximateMemoryUsage()); - } - delete filter; -} - -void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) { - IndexReader* index_reader = reinterpret_cast(value); - if (index_reader->statistics() != nullptr) { - RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, - index_reader->ApproximateMemoryUsage()); - } - delete index_reader; -} - -void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { - UncompressionDict* dict = reinterpret_cast(value); - RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, - dict->ApproximateMemoryUsage()); - delete dict; -} - -} // anonymous namespace - -} // namespace rocksdb diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 1f209210c13..3e9f6ff3f04 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -9,22 +9,20 @@ #include "table/block_fetcher.h" -#include +#include #include +#include "logging/logging.h" +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -93,7 +91,8 @@ inline bool 
BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr && prefetch_buffer_->TryReadFromCache( handle_.offset(), - static_cast(handle_.size()) + kBlockTrailerSize, &slice_)) { + static_cast(handle_.size()) + kBlockTrailerSize, &slice_, + for_compaction_)) { block_size_ = static_cast(handle_.size()); CheckBlockChecksum(); if (!status_.ok()) { @@ -217,9 +216,29 @@ Status BlockFetcher::ReadBlockContents() { PERF_TIMER_GUARD(block_read_time); // Actual file read status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, - &slice_, used_buf_); + &slice_, used_buf_, for_compaction_); } PERF_COUNTER_ADD(block_read_count, 1); + + // TODO: introduce dedicated perf counter for range tombstones + switch (block_type_) { + case BlockType::kFilter: + PERF_COUNTER_ADD(filter_block_read_count, 1); + break; + + case BlockType::kCompressionDictionary: + PERF_COUNTER_ADD(compression_dict_block_read_count, 1); + break; + + case BlockType::kIndex: + PERF_COUNTER_ADD(index_block_read_count, 1); + break; + + // Nothing to do here as we don't have counters for the other types. + default: + break; + } + PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); if (!status_.ok()) { return status_; diff --git a/table/block_fetcher.h b/table/block_fetcher.h index b5fee941597..f67c974becb 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,28 +8,44 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "table/block.h" +#include "memory/memory_allocator.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" #include "table/format.h" -#include "util/memory_allocator.h" namespace rocksdb { + +// Retrieves a single block of a given file. Utilizes the prefetch buffer and/or +// persistent cache provided (if any) to try to avoid reading from the file +// directly. Note that both the prefetch buffer and the persistent cache are +// optional; also, note that the persistent cache may be configured to store either +// compressed or uncompressed blocks. +// +// If the retrieved block is compressed and the do_uncompress flag is set, +// BlockFetcher uncompresses the block (using the uncompression dictionary, +// if provided, to prime the compression algorithm), and returns the resulting +// uncompressed block data. Otherwise, it returns the original block. +// +// Two read options affect the behavior of BlockFetcher: if verify_checksums is +// true, the checksum of the (original) block is checked; if fill_cache is true, +// the block is added to the persistent cache if needed. +// +// Memory for uncompressed and compressed blocks is allocated as needed +// using memory_allocator and memory_allocator_compressed, respectively +// (if provided; otherwise, the default allocator is used). + class BlockFetcher { public: - // Read the block identified by "handle" from "file". - // The only relevant option is options.verify_checksums for now. - // On failure return non-OK. - // On success fill *result and return OK - caller owns *result - // @param uncompression_dict Data for presetting the compression library's - // dictionary. 
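The new per-block-type read counters surface through the standard perf context. A short sketch of reading them around a point lookup (public API; the three *_block_read_count fields are the ones this change adds to PerfContext):

#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void ReportBlockReads(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
  rocksdb::get_perf_context()->Reset();

  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key, &value);
  (void)s;  // status handling elided in this sketch

  const rocksdb::PerfContext* ctx = rocksdb::get_perf_context();
  std::cout << "index blocks read:      " << ctx->index_block_read_count << "\n"
            << "filter blocks read:     " << ctx->filter_block_read_count << "\n"
            << "compression dict reads: "
            << ctx->compression_dict_block_read_count << "\n"
            << "block bytes read:       " << ctx->block_read_byte << "\n";
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
}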
BlockFetcher(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, BlockContents* contents, const ImmutableCFOptions& ioptions, - bool do_uncompress, bool maybe_compressed, + bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, MemoryAllocator* memory_allocator = nullptr, - MemoryAllocator* memory_allocator_compressed = nullptr) + MemoryAllocator* memory_allocator_compressed = nullptr, + bool for_compaction = false) : file_(file), prefetch_buffer_(prefetch_buffer), footer_(footer), @@ -39,10 +55,13 @@ class BlockFetcher { ioptions_(ioptions), do_uncompress_(do_uncompress), maybe_compressed_(maybe_compressed), + block_type_(block_type), uncompression_dict_(uncompression_dict), cache_options_(cache_options), memory_allocator_(memory_allocator), - memory_allocator_compressed_(memory_allocator_compressed) {} + memory_allocator_compressed_(memory_allocator_compressed), + for_compaction_(for_compaction) {} + Status ReadBlockContents(); CompressionType get_compression_type() const { return compression_type_; } @@ -58,6 +77,7 @@ class BlockFetcher { const ImmutableCFOptions& ioptions_; bool do_uncompress_; bool maybe_compressed_; + BlockType block_type_; const UncompressionDict& uncompression_dict_; const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; @@ -71,6 +91,7 @@ class BlockFetcher { char stack_buf_[kDefaultStackBufferSize]; bool got_from_prefetch_buffer_ = false; rocksdb::CompressionType compression_type_; + bool for_compaction_ = false; // return true if found bool TryGetUncompressBlockFromPersistentCache(); diff --git a/table/bloom_block.cc b/table/bloom_block.cc deleted file mode 100644 index 61959030a22..00000000000 --- a/table/bloom_block.cc +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "table/bloom_block.h" - -#include -#include "rocksdb/slice.h" -#include "util/dynamic_bloom.h" - -namespace rocksdb { - -void BloomBlockBuilder::AddKeysHashes(const std::vector& keys_hashes) { - for (auto hash : keys_hashes) { - bloom_.AddHash(hash); - } -} - -Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } - -const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; -} // namespace rocksdb diff --git a/table/bloom_block.h b/table/bloom_block.h deleted file mode 100644 index 483fa25d93d..00000000000 --- a/table/bloom_block.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
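Taken together with the class comment above, a caller inside the RocksDB tree drives BlockFetcher roughly as follows. The empty-dictionary accessor and the surrounding objects are assumptions in this sketch; everything else mirrors the constructor parameters listed above:

#include "table/block_fetcher.h"
#include "util/compression.h"

// Sketch: read and (if needed) uncompress a single data block identified by
// `handle`, going through the prefetch buffer / persistent cache when
// available. Error handling is reduced to returning the status.
rocksdb::Status ReadOneBlock(rocksdb::RandomAccessFileReader* file,
                             rocksdb::FilePrefetchBuffer* prefetch_buffer,
                             const rocksdb::Footer& footer,
                             const rocksdb::BlockHandle& handle,
                             const rocksdb::ImmutableCFOptions& ioptions,
                             const rocksdb::PersistentCacheOptions& cache_options,
                             rocksdb::BlockContents* contents) {
  rocksdb::ReadOptions read_options;
  read_options.verify_checksums = true;  // check the block trailer checksum

  rocksdb::BlockFetcher fetcher(
      file, prefetch_buffer, footer, read_options, handle, contents, ioptions,
      /*do_uncompress=*/true, /*maybe_compressed=*/true,
      rocksdb::BlockType::kData,
      rocksdb::UncompressionDict::GetEmptyDict() /* assumed accessor */,
      cache_options);
  return fetcher.ReadBlockContents();
}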
-#pragma once - -#include -#include -#include "util/dynamic_bloom.h" - -namespace rocksdb { -class Logger; - -class BloomBlockBuilder { - public: - static const std::string kBloomBlock; - - explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} - - void SetTotalBits(Allocator* allocator, uint32_t total_bits, - uint32_t locality, size_t huge_page_tlb_size, - Logger* logger) { - bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, - logger); - } - - uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } - - void AddKeysHashes(const std::vector& keys_hashes); - - Slice Finish(); - - private: - DynamicBloom bloom_; -}; - -}; // namespace rocksdb diff --git a/table/cleanable_test.cc b/table/cleanable_test.cc index f18c33b8399..8478adf523d 100644 --- a/table/cleanable_test.cc +++ b/table/cleanable_test.cc @@ -9,8 +9,8 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc similarity index 99% rename from table/cuckoo_table_builder.cc rename to table/cuckoo/cuckoo_table_builder.cc index f590e6ad405..8857cf7ea9f 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include #include @@ -13,14 +13,14 @@ #include #include "db/dbformat.h" +#include "file/writable_file_writer.h" #include "rocksdb/env.h" #include "rocksdb/table.h" -#include "table/block_builder.h" -#include "table/cuckoo_table_factory.h" +#include "table/block_based/block_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/autovector.h" -#include "util/file_reader_writer.h" #include "util/random.h" #include "util/string_util.h" diff --git a/table/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h similarity index 99% rename from table/cuckoo_table_builder.h rename to table/cuckoo/cuckoo_table_builder.h index 3829541b39a..c42744de019 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo/cuckoo_table_builder.h @@ -30,6 +30,9 @@ class CuckooTableBuilder: public TableBuilder { uint64_t), uint32_t column_family_id, const std::string& column_family_name); + // No copying allowed + CuckooTableBuilder(const CuckooTableBuilder&) = delete; + void operator=(const CuckooTableBuilder&) = delete; // REQUIRES: Either Finish() or Abandon() has been called. ~CuckooTableBuilder() {} @@ -116,10 +119,6 @@ class CuckooTableBuilder: public TableBuilder { std::string smallest_user_key_ = ""; bool closed_; // Either Finish() or Abandon() has been called. 
- - // No copying allowed - CuckooTableBuilder(const CuckooTableBuilder&) = delete; - void operator=(const CuckooTableBuilder&) = delete; }; } // namespace rocksdb diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc similarity index 99% rename from table/cuckoo_table_builder_test.cc rename to table/cuckoo/cuckoo_table_builder_test.cc index c1e350327f3..b84cc9f5bcf 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -10,11 +10,12 @@ #include #include +#include "file/random_access_file_reader.h" +#include "file/writable_file_writer.h" +#include "table/cuckoo/cuckoo_table_builder.h" #include "table/meta_blocks.h" -#include "table/cuckoo_table_builder.h" -#include "util/file_reader_writer.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { extern const uint64_t kCuckooTableMagicNumber; diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo/cuckoo_table_factory.cc similarity index 94% rename from table/cuckoo_table_factory.cc rename to table/cuckoo/cuckoo_table_factory.cc index 74d18d51213..4ca29f364cf 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo/cuckoo_table_factory.cc @@ -4,11 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_factory.h" #include "db/dbformat.h" -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_reader.h" namespace rocksdb { diff --git a/table/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h similarity index 100% rename from table/cuckoo_table_factory.h rename to table/cuckoo/cuckoo_table_factory.h diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc similarity index 98% rename from table/cuckoo_table_reader.cc rename to table/cuckoo/cuckoo_table_reader.cc index f4df2467fdb..982b14763f4 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -8,20 +8,20 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
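A recurring mechanical change in this patch (CuckooTableBuilder above, CuckooTableIterator below) moves the "no copying allowed" declarations out of the private section and into publicly deleted members. As a standalone illustration of the two idioms:

// Old style: copy operations declared private and never defined, so misuse
// fails at link time (or at compile time only for non-friends).
class OldStyle {
 private:
  OldStyle(const OldStyle&);
  void operator=(const OldStyle&);
};

// Style used after this change: publicly deleted copy operations, rejected at
// compile time with a clear diagnostic for every caller.
class NewStyle {
 public:
  NewStyle() = default;
  NewStyle(const NewStyle&) = delete;
  void operator=(const NewStyle&) = delete;
};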
#ifndef ROCKSDB_LITE -#include "table/cuckoo_table_reader.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include #include #include #include #include +#include "memory/arena.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/get_context.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" -#include "table/cuckoo_table_factory.h" -#include "table/get_context.h" -#include "util/arena.h" #include "util/coding.h" namespace rocksdb { @@ -197,6 +197,9 @@ void CuckooTableReader::Prepare(const Slice& key) { class CuckooTableIterator : public InternalIterator { public: explicit CuckooTableIterator(CuckooTableReader* reader); + // No copying allowed + CuckooTableIterator(const CuckooTableIterator&) = delete; + void operator=(const Iterator&) = delete; ~CuckooTableIterator() override {} bool Valid() const override; void SeekToFirst() override; @@ -248,9 +251,6 @@ class CuckooTableIterator : public InternalIterator { uint32_t curr_key_idx_; Slice curr_value_; IterKey curr_key_; - // No copying allowed - CuckooTableIterator(const CuckooTableIterator&) = delete; - void operator=(const Iterator&) = delete; }; CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) @@ -377,7 +377,8 @@ Slice CuckooTableIterator::value() const { InternalIterator* CuckooTableReader::NewIterator( const ReadOptions& /*read_options*/, const SliceTransform* /* prefix_extractor */, Arena* arena, - bool /*skip_filters*/, bool /*for_compaction*/) { + bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/) { if (!status().ok()) { return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); diff --git a/table/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h similarity index 79% rename from table/cuckoo_table_reader.h rename to table/cuckoo/cuckoo_table_reader.h index b37d46373e1..fb0445bcfb3 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -15,11 +15,11 @@ #include #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "options/cf_options.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -45,18 +45,30 @@ class CuckooTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; + // Returns a new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + void SetupForCompaction() override {} // End of methods not implemented. 
diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc similarity index 95% rename from table/cuckoo_table_reader_test.cc rename to table/cuckoo/cuckoo_table_reader_test.cc index 74fb52e6c78..2dfe887ba81 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -13,26 +13,22 @@ int main() { } #else -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#include -#include +#include #include +#include +#include -#include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_factory.h" -#include "table/cuckoo_table_reader.h" +#include "memory/arena.h" +#include "table/cuckoo/cuckoo_table_builder.h" +#include "table/cuckoo/cuckoo_table_factory.h" +#include "table/cuckoo/cuckoo_table_reader.h" #include "table/get_context.h" #include "table/meta_blocks.h" -#include "util/arena.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; @@ -126,7 +122,7 @@ class CuckooReaderTest : public testing::Test { PinnableSlice value; GetContext get_context(ucomp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(user_keys[i]), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr)); ASSERT_STREQ(values[i].c_str(), value.data()); @@ -150,8 +146,9 @@ class CuckooReaderTest : public testing::Test { CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); - InternalIterator* it = - reader.NewIterator(ReadOptions(), nullptr, nullptr, false); + InternalIterator* it = reader.NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->SeekToFirst(); @@ -190,7 +187,9 @@ class CuckooReaderTest : public testing::Test { delete it; Arena arena; - it = reader.NewIterator(ReadOptions(), nullptr, &arena); + it = reader.NewIterator(ReadOptions(), /*prefix_extractor=*/nullptr, &arena, + /*skip_filters=*/false, + TableReaderCaller::kUncategorized); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->Seek(keys[num_items/2]); @@ -337,7 +336,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { AppendInternalKey(¬_found_key, ikey); PinnableSlice value; GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, - Slice(not_found_key), &value, nullptr, nullptr, + Slice(not_found_key), &value, nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr)); @@ -352,7 +351,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context2(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(not_found_key2), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr)); ASSERT_TRUE(value.empty()); @@ -368,7 +367,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { value.Reset(); GetContext get_context3(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key), &value, - nullptr, nullptr, nullptr, nullptr); + nullptr, nullptr, true, nullptr, 
nullptr); ASSERT_OK( reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr)); ASSERT_TRUE(value.empty()); @@ -444,7 +443,7 @@ void WriteFile(const std::vector& keys, // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { value.Reset(); value.clear(); @@ -492,7 +491,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { // Assume only the fast path is triggered GetContext get_context(nullptr, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { diff --git a/table/format.cc b/table/format.cc index 476db85f731..5e9805f4027 100644 --- a/table/format.cc +++ b/table/format.cc @@ -9,25 +9,24 @@ #include "table/format.h" -#include +#include #include +#include "block_fetcher.h" +#include "file/random_access_file_reader.h" +#include "logging/logging.h" +#include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" -#include "util/logging.h" -#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/xxhash.h" namespace rocksdb { @@ -91,6 +90,58 @@ std::string BlockHandle::ToString(bool hex) const { const BlockHandle BlockHandle::kNullBlockHandle(0, 0); +void IndexValue::EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const { + if (previous_handle) { + assert(handle.offset() == previous_handle->offset() + + previous_handle->size() + kBlockTrailerSize); + PutVarsignedint64(dst, handle.size() - previous_handle->size()); + } else { + handle.EncodeTo(dst); + } + assert(dst->size() != 0); + + if (have_first_key) { + PutLengthPrefixedSlice(dst, first_internal_key); + } +} + +Status IndexValue::DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle) { + if (previous_handle) { + int64_t delta; + if (!GetVarsignedint64(input, &delta)) { + return Status::Corruption("bad delta-encoded index value"); + } + handle = BlockHandle( + previous_handle->offset() + previous_handle->size() + kBlockTrailerSize, + previous_handle->size() + delta); + } else { + Status s = handle.DecodeFrom(input); + if (!s.ok()) { + return s; + } + } + + if (!have_first_key) { + first_internal_key = Slice(); + } else if (!GetLengthPrefixedSlice(input, &first_internal_key)) { + return Status::Corruption("bad first key in block info"); + } + + return Status::OK(); +} + +std::string IndexValue::ToString(bool hex, bool have_first_key) const { + std::string s; + EncodeTo(&s, have_first_key, nullptr); + if (hex) { + return Slice(s).ToString(true); + } else { + return s; + } +} + namespace { inline bool IsLegacyFooterFormat(uint64_t magic_number) { return magic_number == kLegacyBlockBasedTableMagicNumber || diff --git a/table/format.h b/table/format.h index f5858850559..2ed80e2fc35 100644 --- 
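The delta encoding above exploits the fact that index entries are written for consecutive blocks, so the next handle's offset is fully determined by the previous handle. A worked example: if block n-1 has offset 0 and size 4096, then with kBlockTrailerSize = 5 block n must start at 0 + 4096 + 5 = 4101. If block n's size is 4000, EncodeTo() emits only the signed varint delta 4000 - 4096 = -96, and DecodeFrom() reconstructs the handle as BlockHandle(0 + 4096 + 5, 4096 + (-96)) = BlockHandle(4101, 4000).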
a/table/format.h +++ b/table/format.h @@ -10,23 +10,19 @@ #pragma once #include #include -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif +#include "file/file_prefetch_buffer.h" +#include "file/random_access_file_reader.h" + #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "memory/memory_allocator.h" #include "options/cf_options.h" +#include "port/malloc.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" -#include "util/file_reader_writer.h" -#include "util/memory_allocator.h" namespace rocksdb { @@ -76,6 +72,35 @@ class BlockHandle { static const BlockHandle kNullBlockHandle; }; +// Value in block-based table file index. +// +// The index entry for block n is: y -> h, [x], +// where: y is some key between the last key of block n (inclusive) and the +// first key of block n+1 (exclusive); h is BlockHandle pointing to block n; +// x, if present, is the first key of block n (unshortened). +// This struct represents the "h, [x]" part. +struct IndexValue { + BlockHandle handle; + // Empty means unknown. + Slice first_internal_key; + + IndexValue() = default; + IndexValue(BlockHandle _handle, Slice _first_internal_key) + : handle(_handle), first_internal_key(_first_internal_key) {} + + // have_first_key indicates whether the `first_internal_key` is used. + // If previous_handle is not null, delta encoding is used; + // in this case, the two handles must point to consecutive blocks: + // handle.offset() == + // previous_handle->offset() + previous_handle->size() + kBlockTrailerSize + void EncodeTo(std::string* dst, bool have_first_key, + const BlockHandle* previous_handle) const; + Status DecodeFrom(Slice* input, bool have_first_key, + const BlockHandle* previous_handle); + + std::string ToString(bool hex, bool have_first_key) const; +}; + inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, uint32_t version) { #ifdef NDEBUG @@ -92,7 +117,7 @@ inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, } inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 4; + return version <= 5; } // Footer encapsulates the fixed information stored at the tail @@ -189,11 +214,20 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; +// Make block size calculation for IO less error prone +inline uint64_t block_size(const BlockHandle& handle) { + return handle.size() + kBlockTrailerSize; +} + inline CompressionType get_block_compression_type(const char* block_data, size_t block_size) { return static_cast(block_data[block_size]); } +// Represents the contents of a block read from an SST file. Depending on how +// it's created, it may or may not own the actual block bytes. As an example, +// BlockContents objects representing data read from mmapped files only point +// into the mmapped region. struct BlockContents { Slice data; // Actual contents of data CacheAllocationPtr allocation; @@ -206,16 +240,20 @@ struct BlockContents { BlockContents() {} + // Does not take ownership of the underlying data bytes. BlockContents(const Slice& _data) : data(_data) {} + // Takes ownership of the underlying data bytes. BlockContents(CacheAllocationPtr&& _data, size_t _size) : data(_data.get(), _size), allocation(std::move(_data)) {} + // Takes ownership of the underlying data bytes. 
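A round-trip sketch of the IndexValue API declared above, using delta encoding against the previous block's handle and the block_size() helper (values are illustrative):

#include <cassert>
#include <string>
#include "table/format.h"

void IndexValueRoundTrip() {
  using namespace rocksdb;

  BlockHandle prev(/*offset=*/0, /*size=*/4096);
  // The next block starts right after prev plus its 5-byte trailer.
  IndexValue v(BlockHandle(block_size(prev), /*size=*/4000),
               /*first_internal_key=*/Slice());

  std::string encoded;
  v.EncodeTo(&encoded, /*have_first_key=*/false, &prev);  // stores a size delta

  Slice input(encoded);
  IndexValue decoded;
  Status s = decoded.DecodeFrom(&input, /*have_first_key=*/false, &prev);
  assert(s.ok());
  assert(decoded.handle.offset() == 4101 && decoded.handle.size() == 4000);
}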
BlockContents(std::unique_ptr&& _data, size_t _size) : data(_data.get(), _size) { allocation.reset(_data.release()); } + // Returns whether the object has ownership of the underlying data bytes. bool own_bytes() const { return allocation.get() != nullptr; } // It's the caller's responsibility to make sure that this is diff --git a/table/full_filter_bits_builder.h b/table/full_filter_bits_builder.h deleted file mode 100644 index 851ed1e2ab4..00000000000 --- a/table/full_filter_bits_builder.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once - -#include -#include -#include - -#include "rocksdb/filter_policy.h" - -namespace rocksdb { - -class Slice; - -class FullFilterBitsBuilder : public FilterBitsBuilder { - public: - explicit FullFilterBitsBuilder(const size_t bits_per_key, - const size_t num_probes); - - ~FullFilterBitsBuilder(); - - virtual void AddKey(const Slice& key) override; - - // Create a filter that for hashes [0, n-1], the filter is allocated here - // When creating filter, it is ensured that - // total_bits = num_lines * CACHE_LINE_SIZE * 8 - // dst len is >= 5, 1 for num_probes, 4 for num_lines - // Then total_bits = (len - 5) * 8, and cache_line_size could be calculated - // +----------------------------------------------------------------+ - // | filter data with length total_bits/8 | - // +----------------------------------------------------------------+ - // | | - // | ... | - // | | - // +----------------------------------------------------------------+ - // | ... | num_probes : 1 byte | num_lines : 4 bytes | - // +----------------------------------------------------------------+ - virtual Slice Finish(std::unique_ptr* buf) override; - - // Calculate num of entries fit into a space. - virtual int CalculateNumEntry(const uint32_t space) override; - - // Calculate space for new filter. This is reverse of CalculateNumEntry. - uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); - - private: - friend class FullFilterBlockTest_DuplicateEntries_Test; - size_t bits_per_key_; - size_t num_probes_; - std::vector hash_entries_; - - // Get totalbits that optimized for cpu cache line - uint32_t GetTotalBitsForLocality(uint32_t total_bits); - - // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, - uint32_t* num_lines); - - // Assuming single threaded access to this function. - void AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits); - - // No Copy allowed - FullFilterBitsBuilder(const FullFilterBitsBuilder&); - void operator=(const FullFilterBitsBuilder&); -}; - -} // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h deleted file mode 100644 index f97952a7ced..00000000000 --- a/table/full_filter_block.h +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
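The deleted FullFilterBitsBuilder header documents the on-disk layout of a full Bloom filter: the raw bit array followed by a 1-byte num_probes and a 4-byte num_lines trailer, with total_bits = num_lines * CACHE_LINE_SIZE * 8 = (len - 5) * 8. A hedged sketch of decoding that trailer (the helper name is illustrative; DecodeFixed32 is the existing coding utility):

#include <cstdint>
#include "rocksdb/slice.h"
#include "util/coding.h"

// Parse the metadata trailer of a full filter laid out as documented above:
//   [ filter data | num_probes : 1 byte | num_lines : 4 bytes ]
// Returns false if the slice is too small to contain the trailer.
bool ParseFullFilterTrailer(const rocksdb::Slice& filter, uint32_t* num_probes,
                            uint32_t* num_lines, uint32_t* cache_line_size) {
  if (filter.size() < 5) {
    return false;
  }
  const char* end = filter.data() + filter.size();
  *num_probes = static_cast<uint8_t>(end[-5]);
  *num_lines = rocksdb::DecodeFixed32(end - 4);
  // total_bits = (len - 5) * 8 and total_bits = num_lines * CACHE_LINE_SIZE * 8,
  // so the cache line size the filter was built for can be recovered:
  *cache_line_size = (*num_lines == 0)
                         ? 0
                         : static_cast<uint32_t>((filter.size() - 5) / *num_lines);
  return true;
}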
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include -#include -#include -#include -#include -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" -#include "db/dbformat.h" -#include "util/hash.h" -#include "table/filter_block.h" - -namespace rocksdb { - -class FilterPolicy; -class FilterBitsBuilder; -class FilterBitsReader; - -// A FullFilterBlockBuilder is used to construct a full filter for a -// particular Table. It generates a single string which is stored as -// a special block in the Table. -// The format of full filter block is: -// +----------------------------------------------------------------+ -// | full filter for all keys in sst file | -// +----------------------------------------------------------------+ -// The full filter can be very large. At the end of it, we put -// num_probes: how many hash functions are used in bloom filter -// -class FullFilterBlockBuilder : public FilterBlockBuilder { - public: - explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - FilterBitsBuilder* filter_bits_builder); - // bits_builder is created in filter_policy, it should be passed in here - // directly. and be deleted here - ~FullFilterBlockBuilder() {} - - virtual bool IsBlockBased() override { return false; } - virtual void StartBlock(uint64_t /*block_offset*/) override {} - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } - virtual Slice Finish(const BlockHandle& tmp, Status* status) override; - using FilterBlockBuilder::Finish; - - protected: - virtual void AddKey(const Slice& key); - std::unique_ptr filter_bits_builder_; - virtual void Reset(); - - private: - // important: all of these might point to invalid addresses - // at the time of destruction of this filter block. destructor - // should NOT dereference them. - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - bool last_whole_key_recorded_; - std::string last_whole_key_str_; - bool last_prefix_recorded_; - std::string last_prefix_str_; - - uint32_t num_added_; - std::unique_ptr filter_data_; - - void AddPrefix(const Slice& key); - - // No copying allowed - FullFilterBlockBuilder(const FullFilterBlockBuilder&); - void operator=(const FullFilterBlockBuilder&); -}; - -// A FilterBlockReader is used to parse filter from SST table. -// KeyMayMatch and PrefixMayMatch would trigger filter checking -class FullFilterBlockReader : public FilterBlockReader { - public: - // REQUIRES: "contents" and filter_bits_reader must stay live - // while *this is live. - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - const Slice& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - BlockContents&& contents, - FilterBitsReader* filter_bits_reader, - Statistics* statistics); - - // bits_reader is created in filter_policy, it should be passed in here - // directly. 
and be deleted here - ~FullFilterBlockReader() {} - - virtual bool IsBlockBased() override { return false; } - - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - - virtual void KeysMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - - virtual void PrefixesMayMatch(MultiGetRange* range, - const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, - const bool no_io = false) override; - virtual size_t ApproximateMemoryUsage() const override; - virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, - const SliceTransform* prefix_extractor, - const Comparator* comparator, - const Slice* const const_ikey_ptr, bool* filter_checked, - bool need_upper_bound_check) override; - private: - const SliceTransform* prefix_extractor_; - Slice contents_; - std::unique_ptr filter_bits_reader_; - BlockContents block_contents_; - bool full_length_enabled_; - size_t prefix_extractor_full_length_; - - // No copying allowed - FullFilterBlockReader(const FullFilterBlockReader&); - bool MayMatch(const Slice& entry); - void MayMatch(MultiGetRange* range); - void operator=(const FullFilterBlockReader&); - bool IsFilterCompatible(const Slice* iterate_upper_bound, - const Slice& prefix, const Comparator* comparator); - -}; - -} // namespace rocksdb diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc deleted file mode 100644 index 3abae979a4c..00000000000 --- a/table/full_filter_block_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#include "table/full_filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "table/full_filter_bits_builder.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/string_util.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -class TestFilterBitsBuilder : public FilterBitsBuilder { - public: - explicit TestFilterBitsBuilder() {} - - // Add Key to filter - void AddKey(const Slice& key) override { - hash_entries_.push_back(Hash(key.data(), key.size(), 1)); - } - - // Generate the filter using the keys that are added - Slice Finish(std::unique_ptr* buf) override { - uint32_t len = static_cast(hash_entries_.size()) * 4; - char* data = new char[len]; - for (size_t i = 0; i < hash_entries_.size(); i++) { - EncodeFixed32(data + i * 4, hash_entries_[i]); - } - const char* const_data = data; - buf->reset(const_data); - return Slice(data, len); - } - - private: - std::vector hash_entries_; -}; - -class TestFilterBitsReader : public FilterBitsReader { - public: - explicit TestFilterBitsReader(const Slice& contents) - : data_(contents.data()), len_(static_cast(contents.size())) {} - - // Silence compiler warning about overloaded virtual - using FilterBitsReader::MayMatch; - bool MayMatch(const Slice& entry) override { - uint32_t h = Hash(entry.data(), entry.size(), 1); - for (size_t i = 0; i + 4 <= len_; i += 4) { - if (h == DecodeFixed32(data_ + i)) { - return true; - } - } - return false; - } - - private: - const char* data_; - uint32_t len_; -}; - - -class TestHashFilter : public FilterPolicy { - public: - const char* Name() const override { return "TestHashFilter"; } - - void CreateFilter(const Slice* keys, int n, std::string* dst) const override { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - bool KeyMayMatch(const Slice& key, const Slice& filter) const override { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } - - FilterBitsBuilder* GetFilterBitsBuilder() const override { - return new TestFilterBitsBuilder(); - } - - FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { - return new TestFilterBitsReader(contents); - } -}; - -class PluginFullFilterBlockTest : public testing::Test { - public: - BlockBasedTableOptions table_options_; - - PluginFullFilterBlockTest() { - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); - - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); -} - -TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.Add("box"); - builder.Add("hello"); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - 
ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); -} - -class FullFilterBlockTest : public testing::Test { - public: - BlockBasedTableOptions table_options_; - - FullFilterBlockTest() { - table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); - } - - ~FullFilterBlockTest() override {} -}; - -TEST_F(FullFilterBlockTest, EmptyBuilder) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - Slice block = builder.Finish(); - ASSERT_EQ("", EscapeString(block)); - - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - // Remain same symantic with blockbased filter - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); -} - -TEST_F(FullFilterBlockTest, DuplicateEntries) { - { // empty prefixes - std::unique_ptr prefix_extractor( - NewFixedPrefixTransform(0)); - auto bits_builder = dynamic_cast( - table_options_.filter_policy->GetFilterBitsBuilder()); - const bool WHOLE_KEY = true; - FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, - bits_builder); - ASSERT_EQ(0, builder.NumAdded()); - builder.Add("key"); // test with empty prefix - ASSERT_EQ(2, bits_builder->hash_entries_.size()); - } - - // mix of empty and non-empty - std::unique_ptr prefix_extractor( - NewFixedPrefixTransform(7)); - auto bits_builder = dynamic_cast( - table_options_.filter_policy->GetFilterBitsBuilder()); - const bool WHOLE_KEY = true; - FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, - bits_builder); - ASSERT_EQ(0, builder.NumAdded()); - builder.Add(""); // test with empty key too - builder.Add("prefix1key1"); - builder.Add("prefix1key1"); - builder.Add("prefix1key2"); - builder.Add("prefix1key3"); - builder.Add("prefix2key4"); - // two prefix adn 4 keys - ASSERT_EQ(1 + 2 + 4, bits_builder->hash_entries_.size()); -} - -TEST_F(FullFilterBlockTest, SingleChunk) { - FullFilterBlockBuilder builder( - nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); - ASSERT_EQ(0, builder.NumAdded()); - builder.Add("foo"); - builder.Add("bar"); - builder.Add("box"); - builder.Add("box"); - builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice block = builder.Finish(); - FullFilterBlockReader reader( - nullptr, true, block, - table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); - ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); - ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/table/get_context.cc b/table/get_context.cc index 24c9ba7d5b7..cdb5798f782 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -38,15 +38,13 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { } // namespace -GetContext::GetContext(const Comparator* ucmp, - const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState 
init_state, - const Slice& user_key, PinnableSlice* pinnable_val, - bool* value_found, MergeContext* merge_context, - SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index) +GetContext::GetContext( + const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, const Slice& user_key, + PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, + SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -62,7 +60,9 @@ GetContext::GetContext(const Comparator* ucmp, replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), - is_blob_index_(is_blob_index) { + do_merge_(do_merge), + is_blob_index_(is_blob_index), + tracing_get_id_(tracing_get_id) { if (seq_) { *seq_ = kMaxSequenceNumber; } @@ -182,7 +182,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); - if (ucmp_->Equal(parsed_key.user_key, user_key_)) { + if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) { *matched = true; // If the value is not in the snapshot, skip it if (!CheckCallback(parsed_key.sequence)) { @@ -216,29 +216,44 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } if (kNotFound == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (LIKELY(value_pinner != nullptr)) { - // If the backing resources for the value are provided, pin them - pinnable_val_->PinSlice(value, value_pinner); - } else { - TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (LIKELY(value_pinner != nullptr)) { + // If the backing resources for the value are provided, pin them + pinnable_val_->PinSlice(value, value_pinner); + } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", + this); - // Otherwise copy the value - pinnable_val_->PinSelf(value); + // Otherwise copy the value + pinnable_val_->PinSelf(value); + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, &value, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); } } if (is_blob_index_ != nullptr) { @@ 
-257,14 +272,18 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else if (kMerge == state_) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } } return false; @@ -273,24 +292,23 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(state_ == kNotFound || state_ == kMerge); state_ = kMerge; // value_pinner is not set from plain_table_reader.cc for example. - if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && - value_pinner != nullptr) { - value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); - merge_context_->PushOperand(value, true /*value_pinned*/); - } else { - merge_context_->PushOperand(value, false); - } - if (merge_operator_ != nullptr && - merge_operator_->ShouldMerge(merge_context_->GetOperandsDirectionBackward())) { + push_operand(value, value_pinner); + if (do_merge_ && merge_operator_ != nullptr && + merge_operator_->ShouldMerge( + merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; + // do_merge_ = true this is the case where this function is called + // as part of DB Get API hence merge operators should be merged. + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } } } return false; @@ -307,6 +325,16 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return false; } +void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { + if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && + value_pinner != nullptr) { + value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); + merge_context_->PushOperand(value, true /*value_pinned*/); + } else { + merge_context_->PushOperand(value, false); + } +} + void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner) { #ifndef ROCKSDB_LITE diff --git a/table/get_context.h b/table/get_context.h index 7ed316f0e1a..8a2f24464bc 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -5,18 +5,21 @@ #pragma once #include -#include +#include "db/dbformat.h" #include "db/merge_context.h" #include "db/read_callback.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" -#include "table/block.h" +#include "table/block_based/block.h" namespace rocksdb { class MergeContext; class PinnedIteratorsManager; +// Data structure for accumulating statistics during a point lookup. 
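The do_merge == false paths above exist for the GetMergeOperands use case called out in the comments: instead of collapsing operands with the merge operator, SaveValue() stashes each one in the MergeContext so the caller receives them verbatim. A hedged usage sketch from the application side (the exact GetMergeOperands/GetMergeOperandsOptions signatures are assumed from the public API this plumbing supports):

#include <vector>
#include "rocksdb/db.h"

// Fetch the raw, unmerged operands stored for `key`.
rocksdb::Status DumpMergeOperands(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::GetMergeOperandsOptions opts;
  opts.expected_max_number_of_operands = 16;

  std::vector<rocksdb::PinnableSlice> operands(
      opts.expected_max_number_of_operands);
  int num_operands = 0;
  rocksdb::Status s = db->GetMergeOperands(
      rocksdb::ReadOptions(), db->DefaultColumnFamily(), key, operands.data(),
      &opts, &num_operands);
  // On success, operands[0..num_operands) hold the operands in the order they
  // would otherwise be fed to the merge operator.
  return s;
}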
At the +// end of the point lookup, the corresponding ticker stats are updated. This +// avoids the overhead of frequent ticker stats updates struct GetContextStats { uint64_t num_cache_hit = 0; uint64_t num_cache_index_hit = 0; @@ -41,8 +44,17 @@ struct GetContextStats { uint64_t num_cache_compression_dict_bytes_insert = 0; }; +// A class to hold context about a point lookup, such as pointer to value +// slice, key, merge context etc, as well as the current state of the +// lookup. Any user using GetContext to track the lookup result must call +// SaveValue() whenever the internal key is found. This can happen +// repeatedly in case of merge operands. In case the key may exist with +// high probability, but IO is required to confirm and the user doesn't allow +// it, MarkKeyMayExist() must be called instead of SaveValue(). class GetContext { public: + // Current state of the point lookup. All except kNotFound and kMerge are + // terminal states enum GetState { kNotFound, kFound, @@ -53,24 +65,47 @@ class GetContext { }; GetContextStats get_context_stats_; + // Constructor + // @param value Holds the value corresponding to user_key. If its nullptr + // then return all merge operands corresponding to user_key + // via merge_context + // @param value_found If non-nullptr, set to false if key may be present + // but we can't be certain because we cannot do IO + // @param max_covering_tombstone_seq Pointer to highest sequence number of + // range deletion covering the key. When an internal key + // is found with smaller sequence number, the lookup + // terminates + // @param seq If non-nullptr, the sequence number of the found key will be + // saved here + // @param callback Pointer to ReadCallback to perform additional checks + // for visibility of a key + // @param is_blob_index If non-nullptr, will be used to indicate if a found + // key is of type blob index + // @param do_merge True if value associated with user_key has to be returned + // and false if all the merge operands associated with user_key has to be + // returned. Id do_merge=false then all the merge operands are stored in + // merge_context and they are never merged. The value pointer is untouched. GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, - MergeContext* merge_context, + MergeContext* merge_context, bool do_merge, SequenceNumber* max_covering_tombstone_seq, Env* env, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, - ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, + uint64_t tracing_get_id = 0); - GetContext() = default; + GetContext() = delete; + // This can be called to indicate that a key may be present, but cannot be + // confirmed due to IO not allowed void MarkKeyMayExist(); // Records this key, value, and any meta-data (such as sequence number and // state) into this GetContext. // // If the parsed_key matches the user key that we are looking for, sets - // mathced to true. + // matched to true. // // Returns True if more keys need to be read (due to merges) or // False if the complete value has been found. 
@@ -108,6 +143,12 @@ class GetContext { void ReportCounters(); + bool has_callback() const { return callback_ != nullptr; } + + uint64_t get_tracing_get_id() const { return tracing_get_id_; } + + void push_operand(const Slice& value, Cleanable* value_pinner); + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -130,9 +171,19 @@ class GetContext { PinnedIteratorsManager* pinned_iters_mgr_; ReadCallback* callback_; bool sample_; + // Value is true if it's called as part of DB Get API and false if it's + // called as part of DB GetMergeOperands API. When it's false merge operators + // are never merged. + bool do_merge_; bool* is_blob_index_; + // Used for block cache tracing only. A tracing get id uniquely identifies a + // Get or a MultiGet. + const uint64_t tracing_get_id_; }; +// Call this to replay a log and bring the get_context up to date. The replay +// log must have been created by another GetContext object, whose replay log +// must have been set by calling GetContext::SetReplayLog(). void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, Cleanable* value_pinner = nullptr); diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 6b713e7b951..d7940eeffa9 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -17,10 +17,20 @@ namespace rocksdb { class PinnedIteratorsManager; +struct IterateResult { + Slice key; + bool may_be_out_of_upper_bound; +}; + template class InternalIteratorBase : public Cleanable { public: InternalIteratorBase() {} + + // No copying allowed + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; + virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or @@ -54,11 +64,20 @@ class InternalIteratorBase : public Cleanable { // REQUIRES: Valid() virtual void Next() = 0; - virtual bool NextAndGetResult(Slice* ret_key) { + // Moves to the next entry in the source, and return result. Iterator + // implementation should override this method to help methods inline better, + // or when MayBeOutOfUpperBound() is non-trivial. + // REQUIRES: Valid() + virtual bool NextAndGetResult(IterateResult* result) { Next(); bool is_valid = Valid(); if (is_valid) { - *ret_key = key(); + result->key = key(); + // Default may_be_out_of_upper_bound to true to avoid unnecessary virtual + // call. If an implementation has non-trivial MayBeOutOfUpperBound(), + // it should also override NextAndGetResult(). + result->may_be_out_of_upper_bound = true; + assert(MayBeOutOfUpperBound()); } return is_valid; } @@ -89,10 +108,20 @@ class InternalIteratorBase : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // True if the iterator is invalidated because it is out of the iterator - // upper bound + // True if the iterator is invalidated because it reached a key that is above + // the iterator upper bound. Used by LevelIterator to decide whether it should + // stop or move on to the next file. + // Important: if iterator reached the end of the file without encountering any + // keys above the upper bound, IsOutOfBound() must return false. virtual bool IsOutOfBound() { return false; } + // Keys return from this iterator can be smaller than iterate_lower_bound. + virtual bool MayBeOutOfLowerBound() { return true; } + + // Keys return from this iterator can be larger or equal to + // iterate_upper_bound. 
+ virtual bool MayBeOutOfUpperBound() { return true; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager @@ -131,10 +160,7 @@ class InternalIteratorBase : public Cleanable { } } - private: - // No copying allowed - InternalIteratorBase(const InternalIteratorBase&) = delete; - InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; + bool is_mutable_; }; using InternalIterator = InternalIteratorBase; diff --git a/table/iterator.cc b/table/iterator.cc index 0475b9d1342..f6c7f9cec3f 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -8,9 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/iterator.h" +#include "memory/arena.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" namespace rocksdb { @@ -167,7 +167,7 @@ template InternalIteratorBase* NewErrorInternalIterator(const Status& status) { return new EmptyInternalIterator(status); } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status); template InternalIteratorBase* NewErrorInternalIterator( const Status& status); @@ -182,7 +182,7 @@ InternalIteratorBase* NewErrorInternalIterator(const Status& status, return new (mem) EmptyInternalIterator(status); } } -template InternalIteratorBase* NewErrorInternalIterator( +template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); template InternalIteratorBase* NewErrorInternalIterator( const Status& status, Arena* arena); @@ -191,7 +191,7 @@ template InternalIteratorBase* NewEmptyInternalIterator() { return new EmptyInternalIterator(Status::OK()); } -template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); template InternalIteratorBase* NewEmptyInternalIterator(); template @@ -203,7 +203,7 @@ InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { return new (mem) EmptyInternalIterator(Status::OK()); } } -template InternalIteratorBase* NewEmptyInternalIterator( +template InternalIteratorBase* NewEmptyInternalIterator( Arena* arena); template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index fc5eb2613d8..f8fdde565ec 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -12,6 +12,7 @@ #include #include "table/internal_iterator.h" +#include "test_util/sync_point.h" namespace rocksdb { @@ -56,7 +57,10 @@ class IteratorWrapperBase { // Iterator interface methods bool Valid() const { return valid_; } - Slice key() const { assert(Valid()); return key_; } + Slice key() const { + assert(Valid()); + return result_.key; + } TValue value() const { assert(Valid()); return iter_->value(); @@ -65,11 +69,15 @@ class IteratorWrapperBase { Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); - valid_ = iter_->NextAndGetResult(&key_); + valid_ = iter_->NextAndGetResult(&result_); assert(!valid_ || iter_->status().ok()); } void Prev() { assert(iter_); iter_->Prev(); Update(); } - void Seek(const Slice& k) { assert(iter_); iter_->Seek(k); Update(); } + void Seek(const Slice& k) { + assert(iter_); + iter_->Seek(k); + Update(); + } void SeekForPrev(const Slice& k) { assert(iter_); iter_->SeekForPrev(k); @@ 
-78,6 +86,16 @@ class IteratorWrapperBase { void SeekToFirst() { assert(iter_); iter_->SeekToFirst(); Update(); } void SeekToLast() { assert(iter_); iter_->SeekToLast(); Update(); } + bool MayBeOutOfLowerBound() { + assert(Valid()); + return iter_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() { + assert(Valid()); + return result_.may_be_out_of_upper_bound; + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) { assert(iter_); iter_->SetPinnedItersMgr(pinned_iters_mgr); @@ -95,14 +113,15 @@ class IteratorWrapperBase { void Update() { valid_ = iter_->Valid(); if (valid_) { - key_ = iter_->key(); assert(iter_->status().ok()); + result_.key = iter_->key(); + result_.may_be_out_of_upper_bound = true; } } InternalIteratorBase* iter_; + IterateResult result_; bool valid_; - Slice key_; }; using IteratorWrapper = IteratorWrapperBase; diff --git a/table/merger_test.cc b/table/merger_test.cc index 1b04d065727..8efa2834db6 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -7,8 +7,8 @@ #include #include "table/merging_iterator.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" namespace rocksdb { diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index bd4a186b3c2..2ee379b052e 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -12,6 +12,7 @@ #include #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" @@ -19,11 +20,10 @@ #include "table/internal_iterator.h" #include "table/iter_heap.h" #include "table/iterator_wrapper.h" -#include "util/arena.h" +#include "test_util/sync_point.h" #include "util/autovector.h" #include "util/heap.h" #include "util/stop_watch.h" -#include "util/sync_point.h" namespace rocksdb { // Without anonymous namespace here, we fail the warning -Wmissing-prototypes @@ -51,12 +51,7 @@ class MergingIterator : public InternalIterator { children_[i].Set(children[i]); } for (auto& child : children_) { - if (child.Valid()) { - assert(child.status().ok()); - minHeap_.push(&child); - } else { - considerStatus(child.status()); - } + AddToMinHeapOrCheckStatus(&child); } current_ = CurrentForward(); } @@ -74,12 +69,9 @@ class MergingIterator : public InternalIterator { iter->SetPinnedItersMgr(pinned_iters_mgr_); } auto new_wrapper = children_.back(); + AddToMinHeapOrCheckStatus(&new_wrapper); if (new_wrapper.Valid()) { - assert(new_wrapper.status().ok()); - minHeap_.push(&new_wrapper); current_ = CurrentForward(); - } else { - considerStatus(new_wrapper.status()); } } @@ -98,12 +90,7 @@ class MergingIterator : public InternalIterator { status_ = Status::OK(); for (auto& child : children_) { child.SeekToFirst(); - if (child.Valid()) { - assert(child.status().ok()); - minHeap_.push(&child); - } else { - considerStatus(child.status()); - } + AddToMinHeapOrCheckStatus(&child); } direction_ = kForward; current_ = CurrentForward(); @@ -115,12 +102,7 @@ class MergingIterator : public InternalIterator { status_ = Status::OK(); for (auto& child : children_) { child.SeekToLast(); - if (child.Valid()) { - assert(child.status().ok()); - maxHeap_->push(&child); - } else { - considerStatus(child.status()); - } + AddToMaxHeapOrCheckStatus(&child); } direction_ = kReverse; current_ = CurrentReverse(); @@ -134,14 +116,13 @@ class MergingIterator : public InternalIterator { 
PERF_TIMER_GUARD(seek_child_seek_time); child.Seek(target); } - PERF_COUNTER_ADD(seek_child_seek_count, 1); - if (child.Valid()) { - assert(child.status().ok()); + PERF_COUNTER_ADD(seek_child_seek_count, 1); + { + // Strictly speaking, we timed slightly more than the min heap operation, + // but these operations are very cheap. PERF_TIMER_GUARD(seek_min_heap_time); - minHeap_.push(&child); - } else { - considerStatus(child.status()); + AddToMinHeapOrCheckStatus(&child); } } direction_ = kForward; @@ -163,12 +144,9 @@ class MergingIterator : public InternalIterator { } PERF_COUNTER_ADD(seek_child_seek_count, 1); - if (child.Valid()) { - assert(child.status().ok()); + { PERF_TIMER_GUARD(seek_max_heap_time); - maxHeap_->push(&child); - } else { - considerStatus(child.status()); + AddToMaxHeapOrCheckStatus(&child); } } direction_ = kReverse; @@ -212,6 +190,16 @@ class MergingIterator : public InternalIterator { current_ = CurrentForward(); } + bool NextAndGetResult(IterateResult* result) override { + Next(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->may_be_out_of_upper_bound = MayBeOutOfUpperBound(); + } + return is_valid; + } + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). @@ -221,35 +209,7 @@ class MergingIterator : public InternalIterator { if (direction_ != kReverse) { // Otherwise, retreat the non-current children. We retreat current_ // just after the if-block. - ClearHeaps(); - InitMaxHeap(); - Slice target = key(); - for (auto& child : children_) { - if (&child != current_) { - child.SeekForPrev(target); - TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); - considerStatus(child.status()); - if (child.Valid() && comparator_->Equal(target, child.key())) { - child.Prev(); - considerStatus(child.status()); - } - } - if (child.Valid()) { - assert(child.status().ok()); - maxHeap_->push(&child); - } - } - direction_ = kReverse; - if (!prefix_seek_mode_) { - // Note that we don't do assert(current_ == CurrentReverse()) here - // because it is possible to have some keys larger than the seek-key - // inserted between Seek() and SeekToLast(), which makes current_ not - // equal to CurrentReverse(). - current_ = CurrentReverse(); - } - // The loop advanced all non-current children to be < key() so current_ - // should still be strictly the smallest key. - assert(current_ == CurrentReverse()); + SwitchToBackward(); } // For the heap modifications below to be correct, current_ must be the @@ -281,6 +241,20 @@ class MergingIterator : public InternalIterator { return current_->value(); } + // Here we simply relay the MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from the current child iterator. As long as none of the child iterators can + // possibly be out of bound, we know the current key is within bound. + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->MayBeOutOfLowerBound(); + } + + bool MayBeOutOfUpperBound() override { + assert(Valid()); + return current_->MayBeOutOfUpperBound(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { @@ -331,8 +305,20 @@ class MergingIterator : public InternalIterator { std::unique_ptr maxHeap_; PinnedIteratorsManager* pinned_iters_mgr_; + // In forward direction, process a child that is not in the min heap. + // If valid, add to the min heap. Otherwise, check status.
+ void AddToMinHeapOrCheckStatus(IteratorWrapper*); + + // In backward direction, process a child that is not in the max heap. + // If valid, add to the max heap. Otherwise, check status. + void AddToMaxHeapOrCheckStatus(IteratorWrapper*); + void SwitchToForward(); + // Switch the direction from forward to backward without changing the + // position. Iterator should still be valid. + void SwitchToBackward(); + IteratorWrapper* CurrentForward() const { assert(direction_ == kForward); return !minHeap_.empty() ? minHeap_.top() : nullptr; @@ -345,6 +331,24 @@ class MergingIterator : public InternalIterator { } }; +void MergingIterator::AddToMinHeapOrCheckStatus(IteratorWrapper* child) { + if (child->Valid()) { + assert(child->status().ok()); + minHeap_.push(child); + } else { + considerStatus(child->status()); + } +} + +void MergingIterator::AddToMaxHeapOrCheckStatus(IteratorWrapper* child) { + if (child->Valid()) { + assert(child->status().ok()); + maxHeap_->push(child); + } else { + considerStatus(child->status()); + } +} + void MergingIterator::SwitchToForward() { // Otherwise, advance the non-current children. We advance current_ // just after the if-block. @@ -353,19 +357,42 @@ void MergingIterator::SwitchToForward() { for (auto& child : children_) { if (&child != current_) { child.Seek(target); - considerStatus(child.status()); if (child.Valid() && comparator_->Equal(target, child.key())) { + assert(child.status().ok()); child.Next(); - considerStatus(child.status()); } } - if (child.Valid()) { - minHeap_.push(&child); - } + AddToMinHeapOrCheckStatus(&child); } direction_ = kForward; } +void MergingIterator::SwitchToBackward() { + ClearHeaps(); + InitMaxHeap(); + Slice target = key(); + for (auto& child : children_) { + if (&child != current_) { + child.SeekForPrev(target); + TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); + if (child.Valid() && comparator_->Equal(target, child.key())) { + assert(child.status().ok()); + child.Prev(); + } + } + AddToMaxHeapOrCheckStatus(&child); + } + direction_ = kReverse; + if (!prefix_seek_mode_) { + // Note that we don't do assert(current_ == CurrentReverse()) here + // because it is possible to have some keys larger than the seek-key + // inserted between Seek() and SeekToLast(), which makes current_ not + // equal to CurrentReverse().
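// Illustrative sketch, not part of this patch: the forward-direction min-heap
// discipline that AddToMinHeapOrCheckStatus() and SwitchToForward() maintain,
// reduced to plain STL containers (Child and GreaterByKey are made-up names).
// Every valid child sits in a min heap ordered by its current key; the heap
// top is the merged iterator's current entry, and advancing pops, steps and
// conditionally re-inserts that child.
#include <cassert>
#include <queue>
#include <string>
#include <vector>

struct Child {
  std::vector<std::string> keys;  // already sorted
  size_t pos = 0;
  bool Valid() const { return pos < keys.size(); }
  const std::string& key() const { return keys[pos]; }
  void Next() { ++pos; }
};

struct GreaterByKey {
  bool operator()(const Child* a, const Child* b) const {
    return a->key() > b->key();  // invert so priority_queue acts as a min heap
  }
};

int main() {
  Child c1, c2;
  c1.keys = {"a", "d"};
  c2.keys = {"b", "c"};
  std::priority_queue<Child*, std::vector<Child*>, GreaterByKey> min_heap;
  for (Child* child : {&c1, &c2}) {
    if (child->Valid()) {
      min_heap.push(child);  // analogous to AddToMinHeapOrCheckStatus()
    }
  }
  std::string merged;
  while (!min_heap.empty()) {
    Child* top = min_heap.top();
    min_heap.pop();
    merged += top->key();
    top->Next();
    if (top->Valid()) {
      min_heap.push(top);
    }
  }
  assert(merged == "abcd");
  return 0;
}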
+ current_ = CurrentReverse(); + } + assert(current_ == CurrentReverse()); +} + void MergingIterator::ClearHeaps() { minHeap_.clear(); if (maxHeap_) { diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 3f48095c55b..1ba52d6e1c7 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -7,18 +7,18 @@ #include #include +#include "block_fetcher.h" #include "db/table_properties_collector.h" +#include "file/random_access_file_reader.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/block.h" -#include "table/block_fetcher.h" +#include "table/block_based/block.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" #include "table/table_properties_internal.h" +#include "test_util/sync_point.h" #include "util/coding.h" -#include "util/file_reader_writer.h" -#include "util/sync_point.h" namespace rocksdb { @@ -216,7 +216,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, BlockFetcher block_fetcher( file, prefetch_buffer, footer, read_options, handle, &block_contents, ioptions, false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kProperties, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); // property block is never compressed. Need to add uncompress logic if we are // to compress it.. @@ -228,8 +229,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Block properties_block(std::move(block_contents), kDisableGlobalSequenceNumber); DataBlockIter iter; - properties_block.NewIterator(BytewiseComparator(), - BytewiseComparator(), &iter); + properties_block.NewDataIterator(BytewiseComparator(), BytewiseComparator(), + &iter); auto new_table_properties = new TableProperties(); // All pre-defined properties of type uint64_t @@ -375,8 +376,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options, memory_allocator); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -385,9 +386,8 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, // are to compress it. 
Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); // -- Read property block bool found_properties_block = true; @@ -446,7 +446,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, file, nullptr /* prefetch_buffer */, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; @@ -457,8 +458,8 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -467,7 +468,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool /*compression_type_missing*/, MemoryAllocator* memory_allocator) { Status status; @@ -485,11 +486,11 @@ Status ReadMetaBlock(RandomAccessFileReader* file, read_options.verify_checksums = false; PersistentCacheOptions cache_options; - BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, - metaindex_handle, &metaindex_contents, ioptions, - false /* decompress */, false /*maybe_compressed*/, - UncompressionDict::GetEmptyDict(), cache_options, - memory_allocator); + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, read_options, metaindex_handle, + &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); status = block_fetcher.ReadBlockContents(); if (!status.ok()) { return status; @@ -502,8 +503,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, kDisableGlobalSequenceNumber); std::unique_ptr meta_iter; - meta_iter.reset(metaindex_block.NewIterator( - BytewiseComparator(), BytewiseComparator())); + meta_iter.reset(metaindex_block.NewDataIterator(BytewiseComparator(), + BytewiseComparator())); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); @@ -515,7 +516,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, // Reading metablock BlockFetcher block_fetcher2( file, prefetch_buffer, footer, read_options, block_handle, contents, - ioptions, false /* decompress */, false /*maybe_compressed*/, + ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); return block_fetcher2.ReadBlockContents(); } diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 6efd1225e19..63d66497f63 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -15,7 +15,8 @@ #include "rocksdb/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" 
-#include "table/block_builder.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/block_type.h" #include "table/format.h" #include "util/kv_map.h" @@ -88,7 +89,7 @@ void NotifyCollectTableCollectorsOnBlockAdd( uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, uint64_t blockCompressedBytesSlow); -// NotifyCollectTableCollectorsOnAdd() triggers the `Finish` event for all +// NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors. The collected properties will be added to `builder`. bool NotifyCollectTableCollectorsOnFinish( const std::vector>& collectors, @@ -143,7 +144,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, - const std::string& meta_block_name, + const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr); diff --git a/table/mock_table.cc b/table/mock_table.cc index 9b250604803..50cbb202475 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -6,11 +6,11 @@ #include "table/mock_table.h" #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "port/port.h" #include "rocksdb/table_properties.h" #include "table/get_context.h" #include "util/coding.h" -#include "util/file_reader_writer.h" namespace rocksdb { namespace mock { @@ -21,12 +21,6 @@ const InternalKeyComparator icmp_(BytewiseComparator()); } // namespace -stl_wrappers::KVMap MakeMockFile( - std::vector> l) { - return stl_wrappers::KVMap(l.begin(), l.end(), - stl_wrappers::LessOfComparator(&icmp_)); -} - stl_wrappers::KVMap MakeMockFile( std::initializer_list> l) { return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); @@ -34,7 +28,8 @@ stl_wrappers::KVMap MakeMockFile( InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, - Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/) { return new MockTableIterator(table_); } @@ -143,14 +138,6 @@ void MockTableFactory::AssertLatestFile( ParseInternalKey(Slice(key), &ikey); std::cout << ikey.DebugString(false) << " -> " << value << std::endl; } - std::cout << "Expected:" << std::endl; - for (const auto& kv : file_contents) { - ParsedInternalKey ikey; - std::string key, value; - std::tie(key, value) = kv; - ParseInternalKey(Slice(key), &ikey); - std::cout << ikey.DebugString(false) << " -> " << value << std::endl; - } FAIL(); } } diff --git a/table/mock_table.h b/table/mock_table.h index 5bca14644d8..81d178810f2 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -12,24 +12,22 @@ #include #include -#include "util/kv_map.h" #include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/table.h" #include "table/internal_iterator.h" #include "table/table_builder.h" #include "table/table_reader.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/kv_map.h" #include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { namespace mock { stl_wrappers::KVMap MakeMockFile( std::initializer_list> l = {}); -stl_wrappers::KVMap MakeMockFile( - std::vector> l); struct MockTableFileSystem { port::Mutex mutex; @@ -42,17 +40,25 @@ class MockTableReader : public 
TableReader { InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } - virtual size_t ApproximateMemoryUsage() const override { return 0; } + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + size_t ApproximateMemoryUsage() const override { return 0; } void SetupForCompaction() override {} @@ -186,12 +192,6 @@ class MockTableFactory : public TableFactory { // contents are equal to file_contents void AssertSingleFile(const stl_wrappers::KVMap& file_contents); void AssertLatestFile(const stl_wrappers::KVMap& file_contents); - stl_wrappers::KVMap output() { - assert(!file_system_.files.empty()); - auto latest = file_system_.files.end(); - --latest; - return latest->second; - } private: uint32_t GetAndWriteNextID(WritableFileWriter* file) const; diff --git a/table/multiget_context.h b/table/multiget_context.h index d3a8d09463b..8b5b607b3bf 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include "db/lookup_key.h" #include "db/merge_context.h" @@ -21,22 +22,23 @@ struct KeyContext { LookupKey* lkey; Slice ukey; Slice ikey; + ColumnFamilyHandle* column_family; Status* s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq; bool key_exists; - SequenceNumber seq; void* cb_arg; PinnableSlice* value; GetContext* get_context; - KeyContext(const Slice& user_key, PinnableSlice* val, Status* stat) + KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key, + PinnableSlice* val, Status* stat) : key(&user_key), lkey(nullptr), + column_family(col_family), s(stat), max_covering_tombstone_seq(0), key_exists(false), - seq(0), cb_arg(nullptr), value(val), get_context(nullptr) {} @@ -87,10 +89,9 @@ class MultiGetContext { // htat need to be performed static const int MAX_BATCH_SIZE = 32; - MultiGetContext(KeyContext** sorted_keys, size_t num_keys, - SequenceNumber snapshot) - : sorted_keys_(sorted_keys), - num_keys_(num_keys), + MultiGetContext(autovector* sorted_keys, + size_t begin, size_t num_keys, SequenceNumber snapshot) + : num_keys_(num_keys), value_mask_(0), lookup_key_ptr_(reinterpret_cast(lookup_key_stack_buf)) { int index = 0; @@ -102,6 +103,8 @@ class MultiGetContext { } for (size_t iter = 0; iter != num_keys_; ++iter) { + // autovector may not be contiguous storage, so make a copy + sorted_keys_[iter] = (*sorted_keys)[begin + iter]; sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[index]) LookupKey(*sorted_keys_[iter]->key, snapshot); sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key(); @@ -120,10 +123,10 @@ class MultiGetContext { static const int MAX_LOOKUP_KEYS_ON_STACK = 16; alignas(alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) * MAX_LOOKUP_KEYS_ON_STACK]; - KeyContext** sorted_keys_; + std::array sorted_keys_; size_t num_keys_; uint64_t value_mask_; - std::unique_ptr lookup_key_heap_buf; + 
std::unique_ptr lookup_key_heap_buf; LookupKey* lookup_key_ptr_; public: @@ -231,6 +234,16 @@ class MultiGetContext { return ctx_->value_mask_ & (1ull << iter.index_); } + uint64_t KeysLeft() { + uint64_t new_val = skip_mask_ | ctx_->value_mask_; + uint64_t count = 0; + while (new_val) { + new_val = new_val & (new_val - 1); + count++; + } + return end_ - count; + } + private: friend MultiGetContext; MultiGetContext* ctx_; diff --git a/table/partitioned_filter_block.cc b/table/partitioned_filter_block.cc deleted file mode 100644 index aab0f5509b9..00000000000 --- a/table/partitioned_filter_block.cc +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "table/partitioned_filter_block.h" - -#ifdef ROCKSDB_MALLOC_USABLE_SIZE -#ifdef OS_FREEBSD -#include -#else -#include -#endif -#endif -#include - -#include "monitoring/perf_context_imp.h" -#include "port/port.h" -#include "rocksdb/filter_policy.h" -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "util/coding.h" - -namespace rocksdb { - -PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, - const bool use_value_delta_encoding, - PartitionedIndexBuilder* const p_index_builder, - const uint32_t partition_size) - : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, - filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), - index_on_filter_block_builder_without_seq_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), - p_index_builder_(p_index_builder), - filters_in_partition_(0), - num_added_(0) { - filters_per_partition_ = - filter_bits_builder_->CalculateNumEntry(partition_size); -} - -PartitionedFilterBlockBuilder::~PartitionedFilterBlockBuilder() {} - -void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { - // Use == to send the request only once - if (filters_in_partition_ == filters_per_partition_) { - // Currently only index builder is in charge of cutting a partition. We keep - // requesting until it is granted. 
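// Illustrative sketch, not part of this patch: the bit-counting loop used by
// the KeysLeft() method added to multiget_context.h above. Clearing the lowest
// set bit once per iteration counts the bits set in the combined skip/value
// mask, and the keys still pending are the range size minus that count
// (CountSetBits is a made-up helper name).
#include <cassert>
#include <cstdint>

static uint64_t CountSetBits(uint64_t v) {
  uint64_t count = 0;
  while (v) {
    v &= v - 1;  // clears exactly one set bit per iteration
    ++count;
  }
  return count;
}

int main() {
  const uint64_t mask = 0xBull;  // 0b1011: three keys already skipped or found
  assert(CountSetBits(mask) == 3);
  assert(8 - CountSetBits(mask) == 5);  // 8-key range, 5 keys left to look up
  return 0;
}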
- p_index_builder_->RequestPartitionCut(); - } - if (!p_index_builder_->ShouldCutFilterBlock()) { - return; - } - filter_gc.push_back(std::unique_ptr(nullptr)); - Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); - std::string& index_key = p_index_builder_->GetPartitionKey(); - filters.push_back({index_key, filter}); - filters_in_partition_ = 0; - Reset(); -} - -void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { - MaybeCutAFilterBlock(); - filter_bits_builder_->AddKey(key); - filters_in_partition_++; - num_added_++; -} - -Slice PartitionedFilterBlockBuilder::Finish( - const BlockHandle& last_partition_block_handle, Status* status) { - if (finishing_filters == true) { - // Record the handle of the last written filter block in the index - FilterEntry& last_entry = filters.front(); - std::string handle_encoding; - last_partition_block_handle.EncodeTo(&handle_encoding); - std::string handle_delta_encoding; - PutVarsignedint64( - &handle_delta_encoding, - last_partition_block_handle.size() - last_encoded_handle_.size()); - last_encoded_handle_ = last_partition_block_handle; - const Slice handle_delta_encoding_slice(handle_delta_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, - &handle_delta_encoding_slice); - if (!p_index_builder_->seperator_is_key_plus_seq()) { - index_on_filter_block_builder_without_seq_.Add( - ExtractUserKey(last_entry.key), handle_encoding, - &handle_delta_encoding_slice); - } - filters.pop_front(); - } else { - MaybeCutAFilterBlock(); - } - // If there is no filter partition left, then return the index on filter - // partitions - if (UNLIKELY(filters.empty())) { - *status = Status::OK(); - if (finishing_filters) { - if (p_index_builder_->seperator_is_key_plus_seq()) { - return index_on_filter_block_builder_.Finish(); - } else { - return index_on_filter_block_builder_without_seq_.Finish(); - } - } else { - // This is the rare case where no key was added to the filter - return Slice(); - } - } else { - // Return the next filter partition in line and set Incomplete() status to - // indicate we expect more calls to Finish - *status = Status::Incomplete(); - finishing_filters = true; - return filters.front().filter; - } -} - -PartitionedFilterBlockReader::PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full) - : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), - prefix_extractor_(prefix_extractor), - comparator_(comparator), - table_(table), - index_key_includes_seq_(index_key_includes_seq), - index_value_is_full_(index_value_is_full) { - idx_on_fltr_blk_.reset(new Block(std::move(contents), - kDisableGlobalSequenceNumber, - 0 /* read_amp_bytes_per_bit */, stats)); -} - -PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { - // TODO(myabandeh): if instead of filter object we store only the blocks in - // block cache, then we don't have to manually earse them from block cache - // here. 
- auto block_cache = table_->rep_->table_options.block_cache.get(); - if (UNLIKELY(block_cache == nullptr)) { - return; - } - char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - IndexBlockIter biter; - BlockHandle handle; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - biter.SeekToFirst(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value(); - auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, - table_->rep_->cache_key_prefix_size, - handle, cache_key); - block_cache->Erase(key); - } -} - -bool PartitionedFilterBlockReader::KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { - assert(const_ikey_ptr != nullptr); - assert(block_offset == kNotValid); - if (!whole_key_filtering_) { - return true; - } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // key is out of range - return false; - } - bool cached = false; - auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - &cached, prefix_extractor); - if (UNLIKELY(!filter_partition.value)) { - return true; - } - auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor, - block_offset, no_io); - if (cached) { - return res; - } - if (LIKELY(filter_partition.IsSet())) { - filter_partition.Release(table_->rep_->table_options.block_cache.get()); - } else { - delete filter_partition.value; - } - return res; -} - -bool PartitionedFilterBlockReader::PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { -#ifdef NDEBUG - (void)block_offset; -#endif - assert(const_ikey_ptr != nullptr); - assert(block_offset == kNotValid); - if (!prefix_extractor_ && !prefix_extractor) { - return true; - } - if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { - return true; - } - auto filter_handle = GetFilterPartitionHandle(*const_ikey_ptr); - if (UNLIKELY(filter_handle.size() == 0)) { // prefix is out of range - return false; - } - bool cached = false; - auto filter_partition = - GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, - &cached, prefix_extractor); - if (UNLIKELY(!filter_partition.value)) { - return true; - } - auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor, - kNotValid, no_io); - if (cached) { - return res; - } - if (LIKELY(filter_partition.IsSet())) { - filter_partition.Release(table_->rep_->table_options.block_cache.get()); - } else { - delete filter_partition.value; - } - return res; -} - -BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( - const Slice& entry) { - IndexBlockIter iter; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( - &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - iter.Seek(entry); - if (UNLIKELY(!iter.Valid())) { - return BlockHandle(0, 0); - } - assert(iter.Valid()); - BlockHandle fltr_blk_handle = iter.value(); - return fltr_blk_handle; -} - -BlockBasedTable::CachableEntry -PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, 
- const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { - const bool is_a_filter_partition = true; - auto block_cache = table_->rep_->table_options.block_cache.get(); - if (LIKELY(block_cache != nullptr)) { - if (filter_map_.size() != 0) { - auto iter = filter_map_.find(fltr_blk_handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (iter != filter_map_.end()) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - RecordTick(statistics(), BLOCK_CACHE_FILTER_HIT); - RecordTick(statistics(), BLOCK_CACHE_HIT); - RecordTick(statistics(), BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(iter->second.cache_handle)); - *cached = true; - return iter->second; - } - } - return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, - is_a_filter_partition, no_io, - /* get_context */ nullptr, prefix_extractor); - } else { - auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, - is_a_filter_partition, prefix_extractor); - return {filter, nullptr}; - } -} - -size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { - size_t usage = idx_on_fltr_blk_->usable_size(); -#ifdef ROCKSDB_MALLOC_USABLE_SIZE - usage += malloc_usable_size((void*)this); -#else - usage += sizeof(*this); -#endif // ROCKSDB_MALLOC_USABLE_SIZE - return usage; - // TODO(myabandeh): better estimation for filter_map_ size -} - -// Release the cached entry and decrement its ref count. -void ReleaseFilterCachedEntry(void* arg, void* h) { - Cache* cache = reinterpret_cast(arg); - Cache::Handle* handle = reinterpret_cast(h); - cache->Release(handle); -} - -// TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies( - bool pin, const SliceTransform* prefix_extractor) { - // Before read partitions, prefetch them to avoid lots of IOs - auto rep = table_->rep_; - IndexBlockIter biter; - Statistics* kNullStats = nullptr; - idx_on_fltr_blk_->NewIterator( - &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, - index_key_includes_seq_, index_value_is_full_); - // Index partitions are assumed to be consecuitive. Prefetch them all. 
- // Read the first block offset - biter.SeekToFirst(); - BlockHandle handle = biter.value(); - uint64_t prefetch_off = handle.offset(); - - // Read the last block's offset - biter.SeekToLast(); - handle = biter.value(); - uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; - uint64_t prefetch_len = last_off - prefetch_off; - std::unique_ptr prefetch_buffer; - auto& file = table_->rep_->file; - prefetch_buffer.reset(new FilePrefetchBuffer()); - Status s; - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, - static_cast(prefetch_len)); - - // After prefetch, read the partitions one by one - biter.SeekToFirst(); - Cache* block_cache = rep->table_options.block_cache.get(); - for (; biter.Valid(); biter.Next()) { - handle = biter.value(); - const bool no_io = true; - const bool is_a_filter_partition = true; - auto filter = table_->GetFilter( - prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, - /* get_context */ nullptr, prefix_extractor); - if (LIKELY(filter.IsSet())) { - if (pin) { - filter_map_[handle.offset()] = std::move(filter); - RegisterCleanup(&ReleaseFilterCachedEntry, block_cache, - filter.cache_handle); - } else { - block_cache->Release(filter.cache_handle); - } - } else { - delete filter.value; - } - } -} - -} // namespace rocksdb diff --git a/table/partitioned_filter_block.h b/table/partitioned_filter_block.h deleted file mode 100644 index 5d55da54493..00000000000 --- a/table/partitioned_filter_block.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include -#include -#include -#include "db/dbformat.h" -#include "rocksdb/options.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" - -#include "table/block.h" -#include "table/block_based_table_reader.h" -#include "table/full_filter_block.h" -#include "table/index_builder.h" -#include "util/autovector.h" - -namespace rocksdb { - -class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { - public: - explicit PartitionedFilterBlockBuilder( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, - const bool use_value_delta_encoding, - PartitionedIndexBuilder* const p_index_builder, - const uint32_t partition_size); - - virtual ~PartitionedFilterBlockBuilder(); - - void AddKey(const Slice& key) override; - - size_t NumAdded() const override { return num_added_; } - - virtual Slice Finish(const BlockHandle& last_partition_block_handle, - Status* status) override; - - private: - // Filter data - BlockBuilder index_on_filter_block_builder_; // top-level index builder - BlockBuilder - index_on_filter_block_builder_without_seq_; // same for user keys - struct FilterEntry { - std::string key; - Slice filter; - }; - std::list filters; // list of partitioned indexes and their keys - std::unique_ptr value; - std::vector> filter_gc; - bool finishing_filters = - false; // true if Finish is called once but not complete yet. - // The policy of when cut a filter block and Finish it - void MaybeCutAFilterBlock(); - // Currently we keep the same number of partitions for filters and indexes. - // This would allow for some potentioal optimizations in future. 
If such - // optimizations did not realize we can use different number of partitions and - // eliminate p_index_builder_ - PartitionedIndexBuilder* const p_index_builder_; - // The desired number of filters per partition - uint32_t filters_per_partition_; - // The current number of filters in the last partition - uint32_t filters_in_partition_; - // Number of keys added - size_t num_added_; - BlockHandle last_encoded_handle_; -}; - -class PartitionedFilterBlockReader : public FilterBlockReader, - public Cleanable { - public: - explicit PartitionedFilterBlockReader( - const SliceTransform* prefix_extractor, bool whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats, const InternalKeyComparator comparator, - const BlockBasedTable* table, const bool index_key_includes_seq, - const bool index_value_is_full); - virtual ~PartitionedFilterBlockReader(); - - virtual bool IsBlockBased() override { return false; } - virtual bool KeyMayMatch( - const Slice& key, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual bool PrefixMayMatch( - const Slice& prefix, const SliceTransform* prefix_extractor, - uint64_t block_offset = kNotValid, const bool no_io = false, - const Slice* const const_ikey_ptr = nullptr) override; - virtual size_t ApproximateMemoryUsage() const override; - - private: - BlockHandle GetFilterPartitionHandle(const Slice& entry); - BlockBasedTable::CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, - const bool no_io, bool* cached, - const SliceTransform* prefix_extractor = nullptr); - virtual void CacheDependencies( - bool bin, const SliceTransform* prefix_extractor) override; - - const SliceTransform* prefix_extractor_; - std::unique_ptr idx_on_fltr_blk_; - const InternalKeyComparator comparator_; - const BlockBasedTable* table_; - const bool index_key_includes_seq_; - const bool index_value_is_full_; - std::unordered_map> - filter_map_; -}; - -} // namespace rocksdb diff --git a/table/persistent_cache_helper.cc b/table/persistent_cache_helper.cc index 4e90697a6e5..8431f13db37 100644 --- a/table/persistent_cache_helper.cc +++ b/table/persistent_cache_helper.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include "table/persistent_cache_helper.h" -#include "table/block_based_table_reader.h" +#include "table/block_based/block_based_table_reader.h" #include "table/format.h" namespace rocksdb { diff --git a/table/plain/plain_table_bloom.cc b/table/plain/plain_table_bloom.cc new file mode 100644 index 00000000000..0de541b5685 --- /dev/null +++ b/table/plain/plain_table_bloom.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/plain/plain_table_bloom.h" + +#include +#include +#include "util/dynamic_bloom.h" + +#include "memory/allocator.h" + +namespace rocksdb { + +namespace { + +uint32_t GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_blocks = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_blocks an odd number to make sure more bits are involved + // when determining which block. 
+ if (num_blocks % 2 == 0) { + num_blocks++; + } + + return num_blocks * (CACHE_LINE_SIZE * 8); +} +} // namespace + +PlainTableBloomV1::PlainTableBloomV1(uint32_t num_probes) + : kTotalBits(0), kNumBlocks(0), kNumProbes(num_probes), data_(nullptr) {} + +void PlainTableBloomV1::SetRawData(char* raw_data, uint32_t total_bits, + uint32_t num_blocks) { + data_ = raw_data; + kTotalBits = total_bits; + kNumBlocks = num_blocks; +} + +void PlainTableBloomV1::SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, + size_t huge_page_tlb_size, + Logger* logger) { + kTotalBits = (locality > 0) ? GetTotalBitsForLocality(total_bits) + : (total_bits + 7) / 8 * 8; + kNumBlocks = (locality > 0) ? (kTotalBits / (CACHE_LINE_SIZE * 8)) : 0; + + assert(kNumBlocks > 0 || kTotalBits > 0); + assert(kNumProbes > 0); + + uint32_t sz = kTotalBits / 8; + if (kNumBlocks > 0) { + sz += CACHE_LINE_SIZE - 1; + } + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto cache_line_offset = reinterpret_cast(raw) % CACHE_LINE_SIZE; + if (kNumBlocks > 0 && cache_line_offset > 0) { + raw += CACHE_LINE_SIZE - cache_line_offset; + } + data_ = raw; +} + +void BloomBlockBuilder::AddKeysHashes( + const std::vector& keys_hashes) { + for (auto hash : keys_hashes) { + bloom_.AddHash(hash); + } +} + +Slice BloomBlockBuilder::Finish() { return bloom_.GetRawData(); } + +const std::string BloomBlockBuilder::kBloomBlock = "kBloomBlock"; +} // namespace rocksdb diff --git a/table/plain/plain_table_bloom.h b/table/plain/plain_table_bloom.h new file mode 100644 index 00000000000..8da256b3bb8 --- /dev/null +++ b/table/plain/plain_table_bloom.h @@ -0,0 +1,135 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include + +#include "rocksdb/slice.h" + +#include "port/port.h" +#include "util/bloom_impl.h" +#include "util/hash.h" + +#include "third-party/folly/folly/ConstexprMath.h" + +#include + +namespace rocksdb { +class Slice; +class Allocator; +class Logger; + +// A legacy Bloom filter implementation used by Plain Table db format, for +// schema backward compatibility. Not for use in new filter applications. +class PlainTableBloomV1 { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // locality: If positive, optimize for cache line locality, 0 otherwise. + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit PlainTableBloomV1(uint32_t num_probes = 6); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); + + ~PlainTableBloomV1() {} + + // Assuming single threaded access to this function. 
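// Illustrative sketch, not part of this patch: the cache-line rounding done by
// GetTotalBitsForLocality() above, assuming a 64-byte cache line (512 bits).
// The requested bit count is rounded up to whole cache-line blocks and the
// block count is forced to be odd (TotalBitsForLocalityDemo is a made-up
// helper name).
#include <cassert>
#include <cstdint>

static uint32_t TotalBitsForLocalityDemo(uint32_t total_bits,
                                         uint32_t cache_line_bytes = 64) {
  const uint32_t bits_per_block = cache_line_bytes * 8;
  uint32_t num_blocks = (total_bits + bits_per_block - 1) / bits_per_block;
  if (num_blocks % 2 == 0) {
    num_blocks++;  // an odd block count involves more bits in block selection
  }
  return num_blocks * bits_per_block;
}

int main() {
  assert(TotalBitsForLocalityDemo(1000) == 3 * 512);  // 2 blocks bumped to 3
  assert(TotalBitsForLocalityDemo(512) == 512);       // already a single block
  return 0;
}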
+ void AddHash(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t hash); + + uint32_t GetNumBlocks() const { return kNumBlocks; } + + Slice GetRawData() const { return Slice(data_, GetTotalBits() / 8); } + + void SetRawData(char* raw_data, uint32_t total_bits, uint32_t num_blocks = 0); + + uint32_t GetTotalBits() const { return kTotalBits; } + + bool IsInitialized() const { return kNumBlocks > 0 || kTotalBits > 0; } + + private: + uint32_t kTotalBits; + uint32_t kNumBlocks; + const uint32_t kNumProbes; + + char* data_; + + static constexpr int LOG2_CACHE_LINE_SIZE = + folly::constexpr_log2(CACHE_LINE_SIZE); +}; + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void PlainTableBloomV1::Prefetch(uint32_t h) { + if (kNumBlocks != 0) { + uint32_t ignored; + LegacyLocalityBloomImpl::PrepareHashMayMatch( + h, kNumBlocks, data_, &ignored, LOG2_CACHE_LINE_SIZE); + } +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +inline bool PlainTableBloomV1::MayContainHash(uint32_t h) const { + assert(IsInitialized()); + if (kNumBlocks != 0) { + return LegacyLocalityBloomImpl::HashMayMatch( + h, kNumBlocks, kNumProbes, data_, LOG2_CACHE_LINE_SIZE); + } else { + return LegacyNoLocalityBloomImpl::HashMayMatch(h, kTotalBits, kNumProbes, + data_); + } +} + +inline void PlainTableBloomV1::AddHash(uint32_t h) { + assert(IsInitialized()); + if (kNumBlocks != 0) { + LegacyLocalityBloomImpl::AddHash(h, kNumBlocks, kNumProbes, data_, + LOG2_CACHE_LINE_SIZE); + } else { + LegacyNoLocalityBloomImpl::AddHash(h, kTotalBits, kNumProbes, data_); + } +} + +class BloomBlockBuilder { + public: + static const std::string kBloomBlock; + + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} + + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, + logger); + } + + uint32_t GetNumBlocks() const { return bloom_.GetNumBlocks(); } + + void AddKeysHashes(const std::vector& keys_hashes); + + Slice Finish(); + + private: + PlainTableBloomV1 bloom_; +}; + +}; // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain/plain_table_builder.cc similarity index 97% rename from table/plain_table_builder.cc rename to table/plain/plain_table_builder.cc index 453b6c768b5..696340525a7 100644 --- a/table/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#ifndef ROCKSDB_LITE -#include "table/plain_table_builder.h" +#include "table/plain/plain_table_builder.h" #include @@ -12,21 +12,21 @@ #include #include +#include "db/dbformat.h" +#include "file/writable_file_writer.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/plain_table_factory.h" -#include "db/dbformat.h" -#include "table/block_builder.h" -#include "table/bloom_block.h" -#include "table/plain_table_index.h" +#include "table/block_based/block_builder.h" #include "table/format.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/crc32c.h" -#include "util/file_reader_writer.h" #include "util/stop_watch.h" namespace rocksdb { diff --git a/table/plain_table_builder.h b/table/plain/plain_table_builder.h similarity index 93% rename from table/plain_table_builder.h rename to table/plain/plain_table_builder.h index ca0879a4e1d..f2cd6009eb3 100644 --- a/table/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include #include @@ -12,9 +13,9 @@ #include "rocksdb/status.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" -#include "table/bloom_block.h" -#include "table/plain_table_index.h" -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_index.h" +#include "table/plain/plain_table_key_coding.h" #include "table/table_builder.h" namespace rocksdb { @@ -24,6 +25,9 @@ class BlockHandle; class WritableFile; class TableBuilder; +// The builder class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. class PlainTableBuilder: public TableBuilder { public: // Create a builder that will store the contents of the table it is @@ -41,6 +45,9 @@ class PlainTableBuilder: public TableBuilder { const std::string& column_family_name, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, bool store_index_in_file = false); + // No copying allowed + PlainTableBuilder(const PlainTableBuilder&) = delete; + void operator=(const PlainTableBuilder&) = delete; // REQUIRES: Either Finish() or Abandon() has been called. ~PlainTableBuilder(); @@ -127,10 +134,6 @@ class PlainTableBuilder: public TableBuilder { } bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); } - - // No copying allowed - PlainTableBuilder(const PlainTableBuilder&) = delete; - void operator=(const PlainTableBuilder&) = delete; }; } // namespace rocksdb diff --git a/table/plain_table_factory.cc b/table/plain/plain_table_factory.cc similarity index 98% rename from table/plain_table_factory.cc rename to table/plain/plain_table_factory.cc index 0dccec55242..6c6905dab1f 100644 --- a/table/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -4,7 +4,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef ROCKSDB_LITE -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include #include @@ -12,8 +12,8 @@ #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/convenience.h" -#include "table/plain_table_builder.h" -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_builder.h" +#include "table/plain/plain_table_reader.h" #include "util/string_util.h" namespace rocksdb { diff --git a/table/plain_table_factory.h b/table/plain/plain_table_factory.h similarity index 93% rename from table/plain_table_factory.h rename to table/plain/plain_table_factory.h index dade1566096..1bd155f93e9 100644 --- a/table/plain_table_factory.h +++ b/table/plain/plain_table_factory.h @@ -24,7 +24,19 @@ class WritableFile; class Table; class TableBuilder; -// IndexedTable requires fixed length key, configured as a constructor +// PlainTableFactory is the entry point to the PlainTable format of +// SST files. It returns instances of PlainTableBuilder as the builder +// class and PlainTableReader as the reader class, where the format is +// actually implemented. +// +// The PlainTable is designed for memory-mapped file systems, e.g. tmpfs. +// Data is not organized in blocks, which allows fast access. Because of the +// following downsides +// 1. Data compression is not supported. +// 2. Data is not checksummed. +// it is not recommended to use this format on other types of file systems. +// +// PlainTable requires fixed length key, configured as a constructor // parameter of the factory class. Output file format: // +-------------+-----------------+ // | version | user_key_length | diff --git a/table/plain_table_index.cc b/table/plain/plain_table_index.cc similarity index 98% rename from table/plain_table_index.cc rename to table/plain/plain_table_index.cc index 43740923974..b4207f348cb 100644 --- a/table/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -5,13 +5,9 @@ #ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif +#include -#include - -#include "table/plain_table_index.h" +#include "table/plain/plain_table_index.h" #include "util/coding.h" #include "util/hash.h" diff --git a/table/plain_table_index.h b/table/plain/plain_table_index.h similarity index 85% rename from table/plain_table_index.h rename to table/plain/plain_table_index.h index 360d998279a..c4bb272282c 100644 --- a/table/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -11,15 +11,19 @@ #include #include "db/dbformat.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "options/cf_options.h" #include "rocksdb/options.h" -#include "util/arena.h" -#include "util/hash.h" -#include "util/murmurhash.h" namespace rocksdb { +// This file contains two classes, PlainTableIndex and PlainTableIndexBuilder. +// The two classes implement the index format of PlainTable. +// For a description of the PlainTable format, see the comments of class +// PlainTableFactory +// +// // PlainTableIndex contains buckets size of index_size_, each is a // 32-bit integer. The lower 31 bits contain an offset value (explained below) // and the first bit of the integer indicates type of the offset. @@ -55,6 +59,10 @@ namespace rocksdb { // .... // record N file offset: fixedint32 // + +// The class loads the index block from a PlainTable SST file, and executes +// the index lookup. +// The class is used by the PlainTableReader class.
class PlainTableIndex { public: enum IndexSearchResult { @@ -72,11 +80,22 @@ class PlainTableIndex { index_(nullptr), sub_index_(nullptr) {} + // The function that executes the lookup in the hash table. + // The hash key is `prefix_hash`. The function fills the hash bucket + // content in `bucket_value`, which is up to the caller to interpret. IndexSearchResult GetOffset(uint32_t prefix_hash, uint32_t* bucket_value) const; - Status InitFromRawData(Slice data); + // Initialize data from `index_data`, which points to the raw data of + // the index stored in the SST file. + Status InitFromRawData(Slice index_data); + // Decode the sub-index for a specific hash bucket. + // The `offset` is the value returned as `bucket_value` by GetOffset() + // and is only valid when the return value is `kSubindex`. + // The return value is the pointer to the starting address of the + // sub-index. `upper_bound` is filled with the value indicating how many + // entries the sub-index has. const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset, uint32_t* upper_bound) const { const char* index_ptr = &sub_index_[offset]; @@ -106,9 +125,10 @@ class PlainTableIndex { // After calling Finish(), it returns Slice, which is usually // used either to initialize PlainTableIndex or // to save index to sst file. -// For more details about the index, please refer to: +// For more details about the index, please refer to: // https://github.com/facebook/rocksdb/wiki/PlainTable-Format // #wiki-in-memory-index-format +// The class is used by PlainTableBuilder class. class PlainTableIndexBuilder { public: PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, diff --git a/table/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc similarity index 99% rename from table/plain_table_key_coding.cc rename to table/plain/plain_table_key_coding.cc index 6f5ee9b4ad2..b70ce65e675 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -4,14 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "table/plain_table_key_coding.h" +#include "table/plain/plain_table_key_coding.h" #include #include #include "db/dbformat.h" -#include "table/plain_table_reader.h" -#include "table/plain_table_factory.h" -#include "util/file_reader_writer.h" +#include "file/writable_file_writer.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_reader.h" namespace rocksdb { diff --git a/table/plain_table_key_coding.h b/table/plain/plain_table_key_coding.h similarity index 90% rename from table/plain_table_key_coding.h rename to table/plain/plain_table_key_coding.h index 9a27ad06b78..5f65d5a6560 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain/plain_table_key_coding.h @@ -4,13 +4,19 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once + #ifndef ROCKSDB_LITE #include -#include "rocksdb/slice.h" #include "db/dbformat.h" -#include "table/plain_table_reader.h" +#include "rocksdb/slice.h" +#include "table/plain/plain_table_reader.h" +// The file contains three helper classes of the PlainTable format: +// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. +// These classes perform the lowest level of operations of PlainTable. +// Actual data format of the key is documented in comments of class +// PlainTableFactory.
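To make the GetOffset()/GetSubIndexBasePtrAndUpperBound() contract described above concrete, here is a hedged sketch of how a caller such as PlainTableReader typically interprets the three lookup outcomes. The free function is hypothetical and the enumerator names are taken from the PlainTableIndex::IndexSearchResult declaration; treat the details as assumptions rather than reader code from this patch.

#include <cstdint>

#include "table/plain/plain_table_index.h"

// Sketch only: interpret the bucket value returned by PlainTableIndex.
void LookupPrefix(const rocksdb::PlainTableIndex& index, uint32_t prefix_hash) {
  uint32_t bucket_value = 0;
  switch (index.GetOffset(prefix_hash, &bucket_value)) {
    case rocksdb::PlainTableIndex::kNoPrefixForBucket:
      // No key shares this prefix; a point lookup can stop here.
      break;
    case rocksdb::PlainTableIndex::kDirectToFile:
      // bucket_value is the file offset of the first record with the prefix.
      break;
    case rocksdb::PlainTableIndex::kSubindex: {
      // bucket_value points into the sub-index; fetch its base pointer and
      // entry count, then binary search the per-record offsets.
      uint32_t upper_bound = 0;
      const char* base =
          index.GetSubIndexBasePtrAndUpperBound(bucket_value, &upper_bound);
      (void)base;
      (void)upper_bound;
      break;
    }
  }
}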
namespace rocksdb { class WritableFile; @@ -18,8 +24,8 @@ struct ParsedInternalKey; struct PlainTableReaderFileInfo; enum PlainTableEntryType : unsigned char; -// Helper class to write out a key to an output file -// Actual data format of the key is documented in plain_table_factory.h +// Helper class for the PlainTable format to write out a key to an output file. +// The class is used by PlainTableBuilder. class PlainTableKeyEncoder { public: explicit PlainTableKeyEncoder(EncodingType encoding_type, @@ -53,6 +59,10 @@ class PlainTableKeyEncoder { IterKey pre_prefix_; }; +// The class does raw file reads for PlainTableReader. +// It hides whether it is an mmap read or a non-mmap read. +// The class is implemented in a way to favor the performance of the mmap case. +// The class is used by PlainTableReader. class PlainTableFileReader { public: explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) @@ -122,7 +132,7 @@ class PlainTableFileReader { }; // A helper class to decode keys from input buffer -// Actual data format of the key is documented in plain_table_factory.h +// The class is used by PlainTableReader. class PlainTableKeyDecoder { public: explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, diff --git a/table/plain_table_reader.cc b/table/plain/plain_table_reader.cc similarity index 89% rename from table/plain_table_reader.cc rename to table/plain/plain_table_reader.cc index f33afdefc38..15cd32d0b08 100644 --- a/table/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -5,7 +5,7 @@ #ifndef ROCKSDB_LITE -#include "table/plain_table_reader.h" +#include "table/plain/plain_table_reader.h" #include #include @@ -19,24 +19,23 @@ #include "rocksdb/options.h" #include "rocksdb/statistics.h" -#include "table/block.h" -#include "table/bloom_block.h" -#include "table/filter_block.h" +#include "table/block_based/block.h" +#include "table/block_based/filter_block.h" #include "table/format.h" +#include "table/get_context.h" #include "table/internal_iterator.h" #include "table/meta_blocks.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_key_coding.h" #include "table/two_level_iterator.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_key_coding.h" -#include "table/get_context.h" +#include "memory/arena.h" #include "monitoring/histogram.h" #include "monitoring/perf_context_imp.h" -#include "util/arena.h" #include "util/coding.h" #include "util/dynamic_bloom.h" #include "util/hash.h" -#include "util/murmurhash.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -55,6 +54,10 @@ inline uint32_t GetFixed32Element(const char* base, size_t offset) { class PlainTableIterator : public InternalIterator { public: explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); + // No copying allowed + PlainTableIterator(const PlainTableIterator&) = delete; + void operator=(const Iterator&) = delete; + ~PlainTableIterator() override; bool Valid() const override; @@ -86,9 +89,6 @@ class PlainTableIterator : public InternalIterator { Slice key_; Slice value_; Status status_; - // No copying allowed - PlainTableIterator(const PlainTableIterator&) = delete; - void operator=(const Iterator&) = delete; }; extern const uint64_t kPlainTableMagicNumber; @@ -127,10 +127,11 @@ Status PlainTableReader::Open( return Status::NotSupported("File is too large for PlainTableReader!"); } - TableProperties* props = nullptr; + TableProperties* props_ptr =
nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props, + ioptions, &props_ptr, true /* compression_type_missing */); + std::shared_ptr props(props_ptr); if (!s.ok()) { return s; } @@ -164,7 +165,7 @@ Status PlainTableReader::Open( std::unique_ptr new_reader(new PlainTableReader( ioptions, std::move(file), env_options, internal_comparator, - encoding_type, file_size, props, prefix_extractor)); + encoding_type, file_size, props.get(), prefix_extractor)); s = new_reader->MmapDataIfNeeded(); if (!s.ok()) { @@ -172,8 +173,9 @@ Status PlainTableReader::Open( } if (!full_scan_mode) { - s = new_reader->PopulateIndex(props, bloom_bits_per_key, hash_table_ratio, - index_sparseness, huge_page_tlb_size); + s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key, + hash_table_ratio, index_sparseness, + huge_page_tlb_size); if (!s.ok()) { return s; } @@ -182,6 +184,8 @@ Status PlainTableReader::Open( // can be used. new_reader->full_scan_mode_ = true; } + // PopulateIndex can add to the props, so don't store them until now + new_reader->table_properties_ = props; if (immortal_table && new_reader->file_info_.is_mmap_mode) { new_reader->dummy_cleanable_.reset(new Cleanable()); @@ -196,7 +200,11 @@ void PlainTableReader::SetupForCompaction() { InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, - Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { + Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, + size_t /*compaction_readahead_size*/) { + // Not necessarily used here, but make sure this has been initialized + assert(table_properties_); + bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); @@ -258,23 +266,19 @@ Status PlainTableReader::PopulateIndexRecordList( return s; } -void PlainTableReader::AllocateAndFillBloom( - int bloom_bits_per_key, int num_prefixes, size_t huge_page_tlb_size, - std::vector* prefix_hashes) { - if (!IsTotalOrderMode()) { - uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; - if (bloom_total_bits > 0) { - enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); - FillBloom(prefix_hashes); - } +void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys, + size_t huge_page_tlb_size) { + uint32_t bloom_total_bits = num_keys * bloom_bits_per_key; + if (bloom_total_bits > 0) { + enable_bloom_ = true; + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); } } -void PlainTableReader::FillBloom(std::vector* prefix_hashes) { +void PlainTableReader::FillBloom(const std::vector& prefix_hashes) { assert(bloom_.IsInitialized()); - for (auto prefix_hash : *prefix_hashes) { + for (const auto prefix_hash : prefix_hashes) { bloom_.AddHash(prefix_hash); } } @@ -293,13 +297,12 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, size_t index_sparseness, size_t huge_page_tlb_size) { assert(props != nullptr); - table_properties_.reset(props); BlockContents index_block_contents; Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents, + BlockType::kIndex, &index_block_contents, true /* compression_type_missing */); bool 
index_in_file = s.ok(); @@ -310,7 +313,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (index_in_file) { s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, &bloom_block_contents, + BloomBlockBuilder::kBloomBlock, BlockType::kFilter, + &bloom_block_contents, true /* compression_type_missing */); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -351,14 +355,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (!index_in_file) { // Allocate bloom filter here for total order mode. if (IsTotalOrderMode()) { - uint32_t num_bloom_bits = - static_cast(table_properties_->num_entries) * - bloom_bits_per_key; - if (num_bloom_bits > 0) { - enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); - } + AllocateBloom(bloom_bits_per_key, + static_cast(props->num_entries), + huge_page_tlb_size); } } else if (bloom_in_file) { enable_bloom_ = true; @@ -373,10 +372,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, } } // cast away const qualifier, because bloom_ won't be changed - bloom_.SetRawData( - const_cast( - reinterpret_cast(bloom_block->data())), - static_cast(bloom_block->size()) * 8, num_blocks); + bloom_.SetRawData(const_cast(bloom_block->data()), + static_cast(bloom_block->size()) * 8, + num_blocks); } else { // Index in file but no bloom in file. Disable bloom filter in this case. enable_bloom_ = false; @@ -389,6 +387,7 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, std::vector prefix_hashes; if (!index_in_file) { + // Populates _bloom if enabled (total order mode) s = PopulateIndexRecordList(&index_builder, &prefix_hashes); if (!s.ok()) { return s; @@ -401,10 +400,15 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, } if (!index_in_file) { - // Calculated bloom filter size and allocate memory for - // bloom filter based on the number of prefixes, then fill it. - AllocateAndFillBloom(bloom_bits_per_key, index_.GetNumPrefixes(), - huge_page_tlb_size, &prefix_hashes); + if (!IsTotalOrderMode()) { + // Calculated bloom filter size and allocate memory for + // bloom filter based on the number of prefixes, then fill it. + AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(), + huge_page_tlb_size); + if (enable_bloom_) { + FillBloom(prefix_hashes); + } + } } // Fill two table properties. 
@@ -613,7 +617,14 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) { + return 0; +} + +uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, + const Slice& /*end*/, + TableReaderCaller /*caller*/) { return 0; } diff --git a/table/plain_table_reader.h b/table/plain/plain_table_reader.h similarity index 85% rename from table/plain_table_reader.h rename to table/plain/plain_table_reader.h index 14760f20a57..c956913a04f 100644 --- a/table/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -13,17 +13,17 @@ #include #include "db/dbformat.h" +#include "file/random_access_file_reader.h" +#include "memory/arena.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" +#include "table/plain/plain_table_bloom.h" +#include "table/plain/plain_table_factory.h" +#include "table/plain/plain_table_index.h" #include "table/table_reader.h" -#include "table/plain_table_factory.h" -#include "table/plain_table_index.h" -#include "util/arena.h" -#include "util/dynamic_bloom.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -56,16 +56,17 @@ struct PlainTableReaderFileInfo { file(std::move(_file)) {} }; +// The reader class of PlainTable. For description of PlainTable format +// See comments of class PlainTableFactory, where instances of +// PlainTableReader are created. +class PlainTableReader: public TableReader { + public: // Based on following output file format shown in plain_table_factory.h -// When opening the output file, IndexedTableReader creates a hash table -// from key prefixes to offset of the output file. IndexedTable will decide +// When opening the output file, PlainTableReader creates a hash table +// from key prefixes to offset of the output file. PlainTable will decide // whether it points to the data offset of the first key with the key prefix // or the offset of it. If there are too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk. 
-// -// The implementation of IndexedTableReader requires output file is mmaped -class PlainTableReader: public TableReader { - public: static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, @@ -76,11 +77,14 @@ class PlainTableReader: public TableReader { bool full_scan_mode, const bool immortal_table = false, const SliceTransform* prefix_extractor = nullptr); + // Returns new iterator over table contents + // compaction_readahead_size: its value will only be used if for_compaction = + // true InternalIterator* NewIterator(const ReadOptions&, const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) override; + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0) override; void Prepare(const Slice& target) override; @@ -88,7 +92,11 @@ class PlainTableReader: public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key) override; + uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) override; + + uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) override; uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; @@ -148,7 +156,7 @@ class PlainTableReader: public TableReader { // Bloom filter is used to rule out non-existent key bool enable_bloom_; - DynamicBloom bloom_; + PlainTableBloomV1 bloom_; PlainTableReaderFileInfo file_info_; Arena arena_; CacheAllocationPtr index_block_alloc_; @@ -157,7 +165,9 @@ class PlainTableReader: public TableReader { const ImmutableCFOptions& ioptions_; std::unique_ptr dummy_cleanable_; uint64_t file_size_; + protected: // for testing std::shared_ptr table_properties_; + private: bool IsFixedLength() const { return user_key_len_ != kPlainTableVariableLength; @@ -202,12 +212,11 @@ class PlainTableReader: public TableReader { Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, std::vector* prefix_hashes); - // Internal helper function to allocate memory for bloom filter and fill it - void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes, - size_t huge_page_tlb_size, - std::vector* prefix_hashes); + // Internal helper function to allocate memory for bloom filter + void AllocateBloom(int bloom_bits_per_key, int num_prefixes, + size_t huge_page_tlb_size); - void FillBloom(std::vector* prefix_hashes); + void FillBloom(const std::vector& prefix_hashes); // Read the key and value at `offset` to parameters for keys, the and // `seekable`. diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index 54408bb50e9..48db1d8b41e 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -9,11 +9,11 @@ #include "db/db_iter.h" #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "options/cf_options.h" #include "table/get_context.h" #include "table/table_builder.h" #include "table/table_reader.h" -#include "util/file_reader_writer.h" namespace rocksdb { @@ -65,8 +65,9 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& options) { auto sequence = options.snapshot != nullptr ? 
options.snapshot->GetSequenceNumber() : kMaxSequenceNumber; - auto internal_iter = - r->table_reader->NewIterator(options, r->moptions.prefix_extractor.get()); + auto internal_iter = r->table_reader->NewIterator( + options, r->moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kSSTFileReader); return NewDBIterator(r->options.env, options, r->ioptions, r->moptions, r->ioptions.user_comparator, internal_iter, sequence, r->moptions.max_sequential_skip_in_iterations, @@ -78,8 +79,9 @@ std::shared_ptr SstFileReader::GetTableProperties() return rep_->table_reader->GetTableProperties(); } -Status SstFileReader::VerifyChecksum() { - return rep_->table_reader->VerifyChecksum(); +Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { + return rep_->table_reader->VerifyChecksum(read_options, + TableReaderCaller::kSSTFileReader); } } // namespace rocksdb diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 51bc975af00..dd7a5101677 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -5,14 +5,14 @@ #ifndef ROCKSDB_LITE -#include +#include #include "rocksdb/db.h" #include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" -#include "util/testharness.h" -#include "util/testutil.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index b9a7273e07d..dc2c589f21a 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -6,12 +6,13 @@ #include "rocksdb/sst_file_writer.h" #include + #include "db/dbformat.h" +#include "file/writable_file_writer.h" #include "rocksdb/table.h" -#include "table/block_based_table_builder.h" +#include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" -#include "util/file_reader_writer.h" -#include "util/sync_point.h" +#include "test_util/sync_point.h" namespace rocksdb { diff --git a/table/table_builder.h b/table/table_builder.h index 21df978c3eb..4a4b19b626c 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -15,10 +15,11 @@ #include #include "db/dbformat.h" #include "db/table_properties_collector.h" +#include "file/writable_file_writer.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" -#include "util/file_reader_writer.h" +#include "trace_replay/block_cache_tracer.h" namespace rocksdb { @@ -32,10 +33,12 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, - int _level = -1) + int _level = -1, + BlockCacheTracer* const _block_cache_tracer = nullptr) : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, _internal_comparator, _skip_filters, _immortal, - _level, 0 /* _largest_seqno */) {} + _level, 0 /* _largest_seqno */, + _block_cache_tracer) {} // @param skip_filters Disables loading/accessing the filter block TableReaderOptions(const ImmutableCFOptions& _ioptions, @@ -43,7 +46,8 @@ struct TableReaderOptions { const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters, bool _immortal, int _level, - SequenceNumber _largest_seqno) + SequenceNumber _largest_seqno, + BlockCacheTracer* const _block_cache_tracer) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), 
env_options(_env_options), @@ -51,7 +55,8 @@ struct TableReaderOptions { skip_filters(_skip_filters), immortal(_immortal), level(_level), - largest_seqno(_largest_seqno) {} + largest_seqno(_largest_seqno), + block_cache_tracer(_block_cache_tracer) {} const ImmutableCFOptions& ioptions; const SliceTransform* prefix_extractor; @@ -65,6 +70,7 @@ struct TableReaderOptions { int level; // largest seqno in the table SequenceNumber largest_seqno; + BlockCacheTracer* const block_cache_tracer; }; struct TableBuilderOptions { diff --git a/table/table_properties.cc b/table/table_properties.cc index 8cfa2619591..6e481798c35 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -4,10 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). #include "rocksdb/table_properties.h" + #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/internal_iterator.h" #include "table/table_properties_internal.h" #include "util/string_util.h" diff --git a/table/table_reader.h b/table/table_reader.h index bd6071d9c67..712c20c9a53 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -14,6 +14,7 @@ #include "table/get_context.h" #include "table/internal_iterator.h" #include "table/multiget_context.h" +#include "table/table_reader_caller.h" namespace rocksdb { @@ -26,9 +27,11 @@ struct TableProperties; class GetContext; class MultiGetContext; -// A Table is a sorted map from strings to strings. Tables are -// immutable and persistent. A Table may be safely accessed from -// multiple threads without external synchronization. +// A Table (also referred to as SST) is a sorted map from strings to strings. +// Tables are immutable and persistent. A Table may be safely accessed from +// multiple threads without external synchronization. Table readers are used +// for reading various types of table formats supported by rocksdb including +// BlockBasedTable, PlainTable and CuckooTable format. class TableReader { public: virtual ~TableReader() {} @@ -42,11 +45,12 @@ class TableReader { // all the states but those allocated in arena. // skip_filters: disables checking the bloom filters even if they exist. This // option is effective only for block-based table format. - virtual InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena = nullptr, - bool skip_filters = false, - bool for_compaction = false) = 0; + // compaction_readahead_size: its value will only be used if caller = + // kCompaction + virtual InternalIterator* NewIterator( + const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena, + bool skip_filters, TableReaderCaller caller, + size_t compaction_readahead_size = 0) = 0; virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { @@ -59,7 +63,14 @@ class TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - virtual uint64_t ApproximateOffsetOf(const Slice& key) = 0; + virtual uint64_t ApproximateOffsetOf(const Slice& key, + TableReaderCaller caller) = 0; + + // Given start and end keys, return the approximate data size in the file + // between the keys. The returned value is in terms of file bytes, and so + // includes effects like compression of the underlying data. 
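The ApproximateSize() contract introduced here (declared just below) can be read as the difference of two ApproximateOffsetOf() estimates for the same caller. The following is a hedged sketch of that relationship; it illustrates the semantics rather than the implementation added in this patch, and the helper name is hypothetical.

#include <cstdint>

#include "rocksdb/slice.h"
#include "table/table_reader.h"
#include "table/table_reader_caller.h"

// Sketch only: the size between two keys as a difference of offsets.
uint64_t ApproximateSizeFromOffsets(rocksdb::TableReader* reader,
                                    const rocksdb::Slice& start,
                                    const rocksdb::Slice& end,
                                    rocksdb::TableReaderCaller caller) {
  const uint64_t begin = reader->ApproximateOffsetOf(start, caller);
  const uint64_t finish = reader->ApproximateOffsetOf(end, caller);
  return finish > begin ? finish - begin : 0;
}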
+ virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, + TableReaderCaller caller) = 0; // Set up the table for Compaction. Might change some parameters with // posix_fadvise @@ -112,17 +123,15 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* /*out_file*/, - const SliceTransform* /*prefix_extractor*/) { + virtual Status DumpTable(WritableFile* /*out_file*/) { return Status::NotSupported("DumpTable() not supported"); } // check whether there is corruption in this db file - virtual Status VerifyChecksum() { + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/, + TableReaderCaller /*caller*/) { return Status::NotSupported("VerifyChecksum() not supported"); } - - virtual void Close() {} }; } // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index a9b75715b5f..05bb2ea25e4 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -11,21 +11,21 @@ int main() { } #else -#include "db/db_impl.h" +#include "db/db_impl/db_impl.h" #include "db/dbformat.h" +#include "file/random_access_file_reader.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/table_builder.h" -#include "util/file_reader_writer.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/gflags_compat.h" -#include "util/testharness.h" -#include "util/testutil.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; using GFLAGS_NAMESPACE::SetUsageMessage; @@ -175,7 +175,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, ioptions.merge_operator, ioptions.info_log, ioptions.statistics, GetContext::kNotFound, Slice(key), &value, nullptr, &merge_context, - &max_covering_tombstone_seq, env); + true, &max_covering_tombstone_seq, env); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); @@ -198,7 +198,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Iterator* iter = nullptr; InternalIterator* iiter = nullptr; if (!through_db) { - iiter = table_reader->NewIterator(read_options, nullptr); + iiter = table_reader->NewIterator( + read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); } else { iter = db->NewIterator(read_options); } diff --git a/table/table_reader_caller.h b/table/table_reader_caller.h new file mode 100644 index 00000000000..90c64687197 --- /dev/null +++ b/table/table_reader_caller.h @@ -0,0 +1,39 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace rocksdb { +// A list of callers for a table reader. It is used to trace the caller that +// accesses on a block. This is only used for block cache tracing and analysis. +// A user may use kUncategorized if the caller is not interesting for analysis +// or the table reader is called in the test environment, e.g., unit test, table +// reader benchmark, etc. 
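Since every TableReader entry point now takes a TableReaderCaller (the enum declared just below), call sites thread a caller tag through so block cache traces can attribute each access. Here is a hedged sketch mirroring the updated call sites elsewhere in this patch; the wrapper function is hypothetical.

#include <memory>

#include "rocksdb/options.h"
#include "table/internal_iterator.h"
#include "table/table_reader.h"
#include "table/table_reader_caller.h"

// Sketch only: a user-facing iterator carries kUserIterator so block cache
// traces can attribute the resulting block accesses.
std::unique_ptr<rocksdb::InternalIterator> NewUserIterator(
    rocksdb::TableReader* reader, const rocksdb::ReadOptions& read_options) {
  return std::unique_ptr<rocksdb::InternalIterator>(reader->NewIterator(
      read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
      /*skip_filters=*/false, rocksdb::TableReaderCaller::kUserIterator,
      /*compaction_readahead_size=*/0));
}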
+enum TableReaderCaller : char { + kUserGet = 1, + kUserMultiGet = 2, + kUserIterator = 3, + kUserApproximateSize = 4, + kUserVerifyChecksum = 5, + kSSTDumpTool = 6, + kExternalSSTIngestion = 7, + kRepair = 8, + kPrefetch = 9, + kCompaction = 10, + // A compaction job may refill the block cache with blocks in the new SST + // files if paranoid_file_checks is true. + kCompactionRefill = 11, + // After building a table, it may load all its blocks into the block cache if + // paranoid_file_checks is true. + kFlush = 12, + // sst_file_reader. + kSSTFileReader = 13, + // A list of callers that are either not interesting for analysis or are + // calling from a test environment, e.g., unit test, benchmark, etc. + kUncategorized = 14, + // All callers should be added before kMaxBlockCacheLookupCaller. + kMaxBlockCacheLookupCaller +}; +} // namespace rocksdb diff --git a/table/table_test.cc b/table/table_test.cc index a62ce4255e3..77b96259830 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -16,11 +16,13 @@ #include #include +#include "block_fetcher.h" #include "cache/lru_cache.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" +#include "meta_blocks.h" #include "monitoring/statistics.h" #include "port/port.h" #include "rocksdb/cache.h" @@ -32,26 +34,24 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" #include "rocksdb/write_buffer_manager.h" -#include "table/block.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_factory.h" -#include "table/block_based_table_reader.h" -#include "table/block_builder.h" -#include "table/block_fetcher.h" -#include "table/flush_block_policy.h" +#include "table/block_based/block.h" +#include "table/block_based/block_based_table_builder.h" +#include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_based_table_reader.h" +#include "table/block_based/block_builder.h" +#include "table/block_based/flush_block_policy.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "table/scoped_arena_iterator.h" #include "table/sst_file_writer_collectors.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/compression.h" #include "util/random.h" #include "util/string_util.h" -#include "util/sync_point.h" -#include "util/testharness.h" -#include "util/testutil.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -63,6 +63,8 @@ extern const uint64_t kPlainTableMagicNumber; namespace { +const std::string kDummyValue(10000, 'o'); + // DummyPropertiesCollector used to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { public: @@ -236,7 +238,7 @@ class BlockConstructor: public Constructor { } InternalIterator* NewIterator( const SliceTransform* /*prefix_extractor*/) const override { - return block_->NewIterator(comparator_, comparator_); + return block_->NewDataIterator(comparator_, comparator_); } private: @@ -308,10 +310,13 @@ class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, bool convert_to_internal_key = false, - int level = -1) + int level = -1, SequenceNumber largest_seqno = 0) : Constructor(cmp), + largest_seqno_(largest_seqno), 
convert_to_internal_key_(convert_to_internal_key), - level_(level) {} + level_(level) { + env_ = rocksdb::Env::Default(); + } ~TableConstructor() override { Reset(); } Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, @@ -326,6 +331,14 @@ class TableConstructor: public Constructor { std::unique_ptr builder; std::vector> int_tbl_prop_collector_factories; + + if (largest_seqno_ != 0) { + // Pretend that it's an external file written by SstFileWriter. + int_tbl_prop_collector_factories.emplace_back( + new SstFileWriterPropertiesCollectorFactory(2 /* version */, + 0 /* global_seqno*/)); + } + std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, @@ -362,7 +375,7 @@ class TableConstructor: public Constructor { return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, - level_), + level_, largest_seqno_, &block_cache_tracer_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -370,7 +383,9 @@ class TableConstructor: public Constructor { InternalIterator* NewIterator( const SliceTransform* prefix_extractor) const override { ReadOptions ro; - InternalIterator* iter = table_reader_->NewIterator(ro, prefix_extractor); + InternalIterator* iter = table_reader_->NewIterator( + ro, prefix_extractor, /*arena=*/nullptr, /*skip_filters=*/false, + TableReaderCaller::kUncategorized); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -382,9 +397,11 @@ class TableConstructor: public Constructor { if (convert_to_internal_key_) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); const Slice skey = ikey.Encode(); - return table_reader_->ApproximateOffsetOf(skey); + return table_reader_->ApproximateOffsetOf( + skey, TableReaderCaller::kUncategorized); } - return table_reader_->ApproximateOffsetOf(key); + return table_reader_->ApproximateOffsetOf( + key, TableReaderCaller::kUncategorized); } virtual Status Reopen(const ImmutableCFOptions& ioptions, @@ -412,6 +429,8 @@ class TableConstructor: public Constructor { return static_cast(file_writer_->writable_file()); } + BlockCacheTracer block_cache_tracer_; + private: void Reset() { uniq_id_ = 0; @@ -424,6 +443,7 @@ class TableConstructor: public Constructor { std::unique_ptr file_writer_; std::unique_ptr file_reader_; std::unique_ptr table_reader_; + SequenceNumber largest_seqno_; bool convert_to_internal_key_; int level_; @@ -431,6 +451,7 @@ class TableConstructor: public Constructor { static uint64_t cur_uniq_id_; EnvOptions soptions; + Env* env_; }; uint64_t TableConstructor::cur_uniq_id_ = 1; @@ -1049,7 +1070,9 @@ class BlockBasedTableTest : public TableTest, virtual public ::testing::WithParamInterface { public: - BlockBasedTableTest() : format_(GetParam()) {} + BlockBasedTableTest() : format_(GetParam()) { + env_ = rocksdb::Env::Default(); + } BlockBasedTableOptions GetBlockBasedTableOptions() { BlockBasedTableOptions options; @@ -1057,11 +1080,91 @@ class BlockBasedTableTest return options; } + void SetupTracingTest(TableConstructor* c) { + test_path_ = test::PerThreadDBPath("block_based_table_tracing_test"); + EXPECT_OK(env_->CreateDir(test_path_)); + trace_file_path_ = test_path_ + "/block_cache_trace_file"; + TraceOptions trace_opt; + std::unique_ptr trace_writer; + EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_, + &trace_writer)); + 
c->block_cache_tracer_.StartTrace(env_, trace_opt, std::move(trace_writer)); + { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + { + std::string user_key = "k02"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c->Add(encoded_key, kDummyValue); + } + } + + void VerifyBlockAccessTrace( + TableConstructor* c, + const std::vector& expected_records) { + c->block_cache_tracer_.EndTrace(); + + std::unique_ptr trace_reader; + Status s = + NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + EXPECT_OK(s); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + EXPECT_OK(reader.ReadHeader(&header)); + uint32_t index = 0; + while (s.ok()) { + BlockCacheTraceRecord access; + s = reader.ReadAccess(&access); + if (!s.ok()) { + break; + } + ASSERT_LT(index, expected_records.size()); + EXPECT_NE("", access.block_key); + EXPECT_EQ(access.block_type, expected_records[index].block_type); + EXPECT_GT(access.block_size, 0); + EXPECT_EQ(access.caller, expected_records[index].caller); + EXPECT_EQ(access.no_insert, expected_records[index].no_insert); + EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); + // Get + if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.referenced_key, + expected_records[index].referenced_key); + EXPECT_EQ(access.get_id, expected_records[index].get_id); + EXPECT_EQ(access.get_from_user_specified_snapshot, + expected_records[index].get_from_user_specified_snapshot); + if (access.block_type == TraceType::kBlockTraceDataBlock) { + EXPECT_GT(access.referenced_data_size, 0); + EXPECT_GT(access.num_keys_in_block, 0); + EXPECT_EQ(access.referenced_key_exist_in_block, + expected_records[index].referenced_key_exist_in_block); + } + } else { + EXPECT_EQ(access.referenced_key, ""); + EXPECT_EQ(access.get_id, 0); + EXPECT_TRUE(access.get_from_user_specified_snapshot == Boolean::kFalse); + EXPECT_EQ(access.referenced_data_size, 0); + EXPECT_EQ(access.num_keys_in_block, 0); + EXPECT_TRUE(access.referenced_key_exist_in_block == Boolean::kFalse); + } + index++; + } + EXPECT_EQ(index, expected_records.size()); + EXPECT_OK(env_->DeleteFile(trace_file_path_)); + EXPECT_OK(env_->DeleteDir(test_path_)); + } + protected: uint64_t IndexUncompressedHelper(bool indexCompress); private: uint32_t format_; + Env* env_; + std::string trace_file_path_; + std::string test_path_; }; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; @@ -1480,7 +1583,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - for (int i = 0; i < 4; ++i) { + for (int i = 0; i <= 5; ++i) { Options options; // Make each key/value an individual block table_options.block_size = 64; @@ -1511,11 +1614,16 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { options.prefix_extractor.reset(NewFixedPrefixTransform(4)); break; case 4: - default: - // Binary search index + // Two-level index table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.table_factory.reset(new BlockBasedTableFactory(table_options)); break; + case 5: + // Binary search with first key + table_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + 
options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; } TableConstructor c(BytewiseComparator(), @@ -1538,8 +1646,9 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { auto* reader = c.GetTableReader(); ReadOptions ro; ro.total_order_seek = true; - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); ASSERT_OK(iter->status()); @@ -1597,8 +1706,9 @@ TEST_P(BlockBasedTableTest, NoopTransformSeek) { for (int i = 0; i < 2; ++i) { ReadOptions ro; ro.total_order_seek = (i == 0); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); + std::unique_ptr iter(reader->NewIterator( + ro, moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); iter->Seek(key.Encode()); ASSERT_OK(iter->status()); @@ -1635,8 +1745,9 @@ TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { const MutableCFOptions new_moptions(options); c.Reopen(new_ioptions, new_moptions); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), new_moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), new_moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup // only one kv @@ -1656,10 +1767,10 @@ static std::string RandomString(Random* rnd, int len) { } void AddInternalKey(TableConstructor* c, const std::string& prefix, - int /*suffix_len*/ = 800) { + std::string value = "v", int /*suffix_len*/ = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); - c->Add(k.Encode().ToString(), "v"); + c->Add(k.Encode().ToString(), value); } void TableTest::IndexTest(BlockBasedTableOptions table_options) { @@ -1702,8 +1813,9 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { ASSERT_EQ(5u, props->num_data_blocks); // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr index_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr index_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); // -- Find keys do not exist, but have common prefix. 
std::vector prefixes = {"001", "003", "005", "007", "009"}; @@ -1798,6 +1910,325 @@ TEST_P(BlockBasedTableTest, PartitionIndexTest) { } } +TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + AddInternalKey(&c, "pika"); + + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(1, keys.size()); + + auto reader = c.GetTableReader(); + ReadOptions ropt; + ropt.read_tier = ReadTier::kBlockCacheTier; + std::unique_ptr iter(reader->NewIterator( + ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + iter->Seek(ikey("pika")); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); + + // This used to crash at some point. + iter->Seek(ikey("pika")); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + IndexTest(table_options); +} + +class CustomFlushBlockPolicy : public FlushBlockPolicyFactory, + public FlushBlockPolicy { + public: + explicit CustomFlushBlockPolicy(std::vector keys_per_block) + : keys_per_block_(keys_per_block) {} + + const char* Name() const override { return "table_test"; } + FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&, + const BlockBuilder&) const override { + return new CustomFlushBlockPolicy(keys_per_block_); + } + + bool Update(const Slice&, const Slice&) override { + if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) { + ++current_block_idx_; + keys_in_current_block_ = 1; + return true; + } + + ++keys_in_current_block_; + return false; + } + + std::vector keys_per_block_; + + int current_block_idx_ = 0; + int keys_in_current_block_ = 0; +}; + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { + for (int use_first_key = 0; use_first_key < 2; ++use_first_key) { + SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key)); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = + use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey + : BlockBasedTableOptions::kBinarySearch; + table_options.block_cache = NewLRUCache(10000); // fits all blocks + table_options.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + table_options.flush_block_policy_factory = + std::make_shared(std::vector{2, 1, 3, 2}); + Options options; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator()); + + // Block 0. 
+ AddInternalKey(&c, "aaaa", "v0"); + AddInternalKey(&c, "aaac", "v1"); + + // Block 1. + AddInternalKey(&c, "aaca", "v2"); + + // Block 2. + AddInternalKey(&c, "caaa", "v3"); + AddInternalKey(&c, "caac", "v4"); + AddInternalKey(&c, "caae", "v5"); + + // Block 3. + AddInternalKey(&c, "ccaa", "v6"); + AddInternalKey(&c, "ccac", "v7"); + + // Write the file. + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(8, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(4u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Shouldn't have read data blocks before iterator is seeked. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + auto ikey = [](Slice user_key) { + return InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + }; + + // Seek to a key between blocks. If index contains first key, we shouldn't + // read any data blocks until value is requested. + iter->Seek(ikey("aaba")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v2", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the middle of a block. The block should be read right away. + iter->Seek(ikey("caab")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to just before the same block and don't access value. + // The iterator should keep pinning the block contents. + iter->Seek(ikey("baaa")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to the same block again to check that the block is still pinned. + iter->Seek(ikey("caae")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[5], iter->key().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v5", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and fall through to the next block. Don't access value. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[6], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward again. Block should be read. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Step forward and reach the end. 
+ iter->Next(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to a single-key block and step forward without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 0 : 1, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[3], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 1 : 2, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v3", iter->value().ToString()); + EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + // Seek between blocks and step back without accessing value. + iter->Seek(ikey("aaca")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[1], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 3, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + // All blocks are in cache now, there'll be no more misses ever. + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ("v1", iter->value().ToString()); + + // Next into the next block again. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[2], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 4, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Seek to first and step back without accessing value. + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[0], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->Prev(); + EXPECT_FALSE(iter->Valid()); + EXPECT_EQ(use_first_key ? 2 : 5, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + // Do some SeekForPrev() and SeekToLast() just to cover all methods. + iter->SeekForPrev(ikey("caad")); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[4], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v4", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 3 : 6, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(keys[7], iter->key().ToString()); + EXPECT_EQ(use_first_key ? 4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ("v7", iter->value().ToString()); + EXPECT_EQ(use_first_key ? 
4 : 7, + stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + + EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + c.ResetTableReader(); + } +} + +TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey; + table_options.block_cache = NewLRUCache(10000); + Options options; + options.statistics = CreateDBStatistics(); + Statistics* stats = options.statistics.get(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + + TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false, + /* level */ -1, /* largest_seqno */ 42); + + c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x"); + c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y"); + + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); + ASSERT_EQ(2, keys.size()); + + auto reader = c.GetTableReader(); + auto props = reader->GetTableProperties(); + ASSERT_EQ(1u, props->num_data_blocks); + std::unique_ptr iter(reader->NewIterator( + ReadOptions(), /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + EXPECT_NE(keys[0], iter->key().ToString()); + // Key should have been served from index, without reading data blocks. + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + + EXPECT_EQ("x", iter->value().ToString()); + EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); + EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); + EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(), + iter->key().ToString()); + + c.ResetTableReader(); +} + // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the filter block size also grows. 
@@ -1869,6 +2300,187 @@ TEST_P(BlockBasedTableTest, NumBlockStat) { c.ResetTableReader(); } +TEST_P(BlockBasedTableTest, TracingGetTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + for (uint32_t i = 1; i <= 2; i++) { + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, true, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, /*tracing_get_id=*/i); + get_perf_context()->Reset(); + ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value.ToString(), kDummyValue); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for one index, one filter, and one data + // block access. + record.get_id = 1; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + record.referenced_key_exist_in_block = Boolean::kTrue; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.is_cache_hit = Boolean::kFalse; + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + // The second get should all observe cache hits. 
+ record.is_cache_hit = Boolean::kTrue; + record.get_id = 2; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserGet; + record.get_from_user_specified_snapshot = Boolean::kFalse; + record.referenced_key = encoded_key; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + for (uint32_t i = 1; i <= 2; i++) { + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + c.GetTableReader()->ApproximateOffsetOf( + encoded_key, TableReaderCaller::kUserApproximateSize); + } + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have two records for only index blocks. 
+ record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserApproximateSize; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, TracingIterator) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + table_options.block_cache = NewLRUCache(1024 * 1024, 0); + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + for (uint32_t i = 1; i <= 2; i++) { + std::unique_ptr iter(c.GetTableReader()->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUserIterator)); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + iter.reset(); + } + + // Verify traces. + std::vector expected_records; + // The first two records should be prefetching index and filter blocks. + BlockCacheTraceRecord record; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = Boolean::kFalse; + record.no_insert = Boolean::kFalse; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + // Then we should have three records for index and two data block access. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kUserIterator; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + record.is_cache_hit = Boolean::kFalse; + expected_records.push_back(record); + expected_records.push_back(record); + // When we iterate this file for the second time, we should observe all cache + // hits. + record.block_type = TraceType::kBlockTraceIndexBlock; + record.is_cache_hit = Boolean::kTrue; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + expected_records.push_back(record); + VerifyBlockAccessTrace(&c, expected_records); + c.ResetTableReader(); +} + // A simple tool that takes the snapshot of block cache statistics. class BlockCachePropertiesSnapshot { public: @@ -1954,8 +2566,8 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { // preloading filter/index blocks is enabled. 
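// --- Editor's note (not part of the diff): the tracing tests above verify the
// records emitted by block cache tracing. This is a hedged sketch of how a
// user might turn tracing on for a real DB, assuming the StartBlockCacheTrace /
// EndBlockCacheTrace entry points and NewFileTraceWriter that accompany this
// feature; the trace path is arbitrary.
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"

rocksdb::Status RunWithBlockCacheTrace(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::TraceWriter> writer;
  rocksdb::Status s = rocksdb::NewFileTraceWriter(
      rocksdb::Env::Default(), rocksdb::EnvOptions(),
      "/tmp/block_cache_trace", &writer);
  if (!s.ok()) return s;
  rocksdb::TraceOptions trace_opts;  // defaults record every cache access
  s = db->StartBlockCacheTrace(trace_opts, std::move(writer));
  if (!s.ok()) return s;
  // ... run the workload whose cache behaviour should be captured ...
  return db->EndBlockCacheTrace();
}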
auto reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(reader->TEST_filter_block_preloaded()); - ASSERT_TRUE(reader->TEST_index_reader_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); + ASSERT_FALSE(reader->TEST_IndexBlockInCache()); { // nothing happens in the beginning @@ -1967,7 +2579,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { { GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(), nullptr, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. reader->Get(ReadOptions(), "non-exist-key", &get_context, moptions.prefix_extractor.get()); @@ -1987,7 +2599,11 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Enable the cache for index/filter blocks BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - table_options.block_cache = NewLRUCache(2048, 2); + LRUCacheOptions co; + co.capacity = 2048; + co.num_shard_bits = 2; + co.metadata_charge_policy = kDontChargeCacheMetadata; + table_options.block_cache = NewLRUCache(co); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector keys; @@ -2001,8 +2617,8 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto* reader = dynamic_cast(c.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); - ASSERT_TRUE(!reader->TEST_index_reader_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); + ASSERT_TRUE(reader->TEST_IndexBlockInCache()); // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. @@ -2017,7 +2633,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { 0, 0, 0); ASSERT_EQ(props.GetCacheBytesRead(), 0); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); last_cache_bytes_read = props.GetCacheBytesRead(); } @@ -2033,7 +2649,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Cache hit, bytes read from cache should increase ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); last_cache_bytes_read = props.GetCacheBytesRead(); } @@ -2046,7 +2662,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Cache miss, Bytes read from cache should not change ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); last_cache_bytes_read = props.GetCacheBytesRead(); } @@ -2060,7 +2676,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // Cache hit, bytes read from cache should increase ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read); ASSERT_EQ(props.GetCacheBytesWrite(), - table_options.block_cache->GetUsage()); + static_cast(table_options.block_cache->GetUsage())); } // release the iterator so that the block cache can reset correctly. 
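// --- Editor's note (not part of the diff): the hunk above switches the test
// to the LRUCacheOptions constructor so it can pin the metadata charge policy.
// A small sketch of the same pattern in application code; sizes are arbitrary.
#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeBlockCache() {
  rocksdb::LRUCacheOptions co;
  co.capacity = 32 << 20;   // 32 MiB
  co.num_shard_bits = 4;    // 16 shards
  // Keep cache accounting independent of per-entry metadata overhead, which
  // the exact GetUsage() comparisons in the test above rely on.
  co.metadata_charge_policy = rocksdb::kDontChargeCacheMetadata;
  return rocksdb::NewLRUCache(co);
}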
iter.reset(); @@ -2134,11 +2750,11 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); - ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + ASSERT_FALSE(reader->TEST_FilterBlockInCache()); PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, moptions4.prefix_extractor.get())); ASSERT_STREQ(value.data(), "hello"); @@ -2221,21 +2837,25 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto reader = c.GetTableReader(); PinnableSlice value; - GetContext get_context(options.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - if (index_and_filter_in_cache) { - // data, index and filter block - ASSERT_EQ(get_perf_context()->block_read_count, 3); - } else { - // just the data block - ASSERT_EQ(get_perf_context()->block_read_count, 1); + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, true, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + if (index_and_filter_in_cache) { + // data, index and filter block + ASSERT_EQ(get_perf_context()->block_read_count, 3); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); + } else { + // just the data block + ASSERT_EQ(get_perf_context()->block_read_count, 1); + } + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_STREQ(value.data(), "hello"); } - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); // Get non-existing key user_key = "does-not-exist"; @@ -2243,21 +2863,26 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { encoded_key = internal_key.Encode().ToString(); value.Reset(); - get_context = GetContext(options.comparator, nullptr, nullptr, nullptr, + { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); - get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, - moptions.prefix_extractor.get())); - ASSERT_EQ(get_context.State(), GetContext::kNotFound); + nullptr, true, nullptr, nullptr); + get_perf_context()->Reset(); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + } if (index_and_filter_in_cache) { if (bloom_filter_type == 0) { // with block-based, we read index and then the filter ASSERT_EQ(get_perf_context()->block_read_count, 2); + ASSERT_EQ(get_perf_context()->index_block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } else { // with full-filter, we read filter first and then we stop ASSERT_EQ(get_perf_context()->block_read_count, 1); + ASSERT_EQ(get_perf_context()->filter_block_read_count, 1); } } else { // filter is already in memory 
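// --- Editor's note (not part of the diff): the updated assertions above rely
// on the per-block-type read counters in the perf context. A sketch of how the
// same counters can be read around a Get() in application or test code; the
// surrounding DB call is elided.
#include <cstdint>
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

void SampleBlockReadCounters() {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
  rocksdb::get_perf_context()->Reset();
  // ... issue db->Get(...) here ...
  uint64_t total_reads  = rocksdb::get_perf_context()->block_read_count;
  uint64_t index_reads  = rocksdb::get_perf_context()->index_block_read_count;
  uint64_t filter_reads = rocksdb::get_perf_context()->filter_block_read_count;
  (void)total_reads; (void)index_reads; (void)filter_reads;
}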
and it figures out that the key doesn't @@ -2268,176 +2893,6 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { } } -// A wrapper around LRICache that also keeps track of data blocks (in contrast -// with the objects) in the cache. The class is very simple and can be used only -// for trivial tests. -class MockCache : public LRUCache { - public: - MockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio) - : LRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio) {} - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { - // Replace the deleter with our own so that we keep track of data blocks - // erased from the cache - deleters_[key.ToString()] = deleter; - return ShardedCache::Insert(key, value, charge, &MockDeleter, handle, - priority); - } - // This is called by the application right after inserting a data block - void TEST_mark_as_data_block(const Slice& key, size_t charge) override { - marked_data_in_cache_[key.ToString()] = charge; - marked_size_ += charge; - } - using DeleterFunc = void (*)(const Slice& key, void* value); - static std::map deleters_; - static std::map marked_data_in_cache_; - static size_t marked_size_; - static void MockDeleter(const Slice& key, void* value) { - // If the item was marked for being data block, decrease its usage from the - // total data block usage of the cache - if (marked_data_in_cache_.find(key.ToString()) != - marked_data_in_cache_.end()) { - marked_size_ -= marked_data_in_cache_[key.ToString()]; - } - // Then call the origianl deleter - assert(deleters_.find(key.ToString()) != deleters_.end()); - auto deleter = deleters_[key.ToString()]; - deleter(key, value); - } -}; - -size_t MockCache::marked_size_ = 0; -std::map MockCache::deleters_; -std::map MockCache::marked_data_in_cache_; - -// Block cache can contain raw data blocks as well as general objects. If an -// object depends on the table to be live, it then must be destructed before the -// table is closed. This test makes sure that the only items remains in the -// cache after the table is closed are raw data blocks. -TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { - std::vector compression_types{kNoCompression}; - - // The following are the compression library versions supporting compression - // dictionaries. See the test case CacheCompressionDict in the - // DBBlockCacheTest suite. 
-#ifdef ZLIB - compression_types.push_back(kZlibCompression); -#endif // ZLIB -#if LZ4_VERSION_NUMBER >= 10400 - compression_types.push_back(kLZ4Compression); - compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 -#if ZSTD_VERSION_NUMBER >= 500 - compression_types.push_back(kZSTD); -#endif // ZSTD_VERSION_NUMBER >= 500 - - for (int level: {-1, 0, 1, 10}) { - for (auto index_type : - {BlockBasedTableOptions::IndexType::kBinarySearch, - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) { - for (bool block_based_filter : {true, false}) { - for (bool partition_filter : {true, false}) { - if (partition_filter && - (block_based_filter || - index_type != - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch)) { - continue; - } - for (bool index_and_filter_in_cache : {true, false}) { - for (bool pin_l0 : {true, false}) { - for (bool pin_top_level : {true, false}) { - if (pin_l0 && !index_and_filter_in_cache) { - continue; - } - - for (auto compression_type : compression_types) { - for (uint32_t max_dict_bytes : {0, 1 << 14}) { - if (compression_type == kNoCompression && max_dict_bytes) - continue; - - // Create a table - Options opt; - std::unique_ptr ikc; - ikc.reset(new test::PlainInternalKeyComparator( - opt.comparator)); - opt.compression = compression_type; - opt.compression_opts.max_dict_bytes = max_dict_bytes; - BlockBasedTableOptions table_options = - GetBlockBasedTableOptions(); - table_options.block_size = 1024; - table_options.index_type = index_type; - table_options.pin_l0_filter_and_index_blocks_in_cache = - pin_l0; - table_options.pin_top_level_index_and_filter = - pin_top_level; - table_options.partition_filters = partition_filter; - table_options.cache_index_and_filter_blocks = - index_and_filter_in_cache; - // big enough so we don't ever lose cached values. 
- table_options.block_cache = std::make_shared( - 16 * 1024 * 1024, 4, false, 0.0); - table_options.filter_policy.reset( - rocksdb::NewBloomFilterPolicy(10, block_based_filter)); - opt.table_factory.reset(NewBlockBasedTableFactory( - table_options)); - - bool convert_to_internal_key = false; - TableConstructor c(BytewiseComparator(), - convert_to_internal_key, level); - std::string user_key = "k01"; - std::string key = - InternalKey(user_key, 0, kTypeValue).Encode().ToString(); - c.Add(key, "hello"); - std::vector keys; - stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); - const MutableCFOptions moptions(opt); - c.Finish(opt, ioptions, moptions, table_options, *ikc, - &keys, &kvmap); - - // Doing a read to make index/filter loaded into the cache - auto table_reader = - dynamic_cast(c.GetTableReader()); - PinnableSlice value; - GetContext get_context(opt.comparator, nullptr, nullptr, - nullptr, GetContext::kNotFound, user_key, &value, - nullptr, nullptr, nullptr, nullptr); - InternalKey ikey(user_key, 0, kTypeValue); - auto s = table_reader->Get(ReadOptions(), key, &get_context, - moptions.prefix_extractor.get()); - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); - - // Close the table - c.ResetTableReader(); - - auto usage = table_options.block_cache->GetUsage(); - auto pinned_usage = - table_options.block_cache->GetPinnedUsage(); - // The only usage must be for marked data blocks - ASSERT_EQ(usage, MockCache::marked_size_); - // There must be some pinned data since PinnableSlice has - // not released them yet - ASSERT_GT(pinned_usage, 0); - // Release pinnable slice reousrces - value.Reset(); - pinned_usage = table_options.block_cache->GetPinnedUsage(); - ASSERT_EQ(pinned_usage, 0); - } - } - } - } - } - } - } - } - } // level -} - TEST_P(BlockBasedTableTest, BlockCacheLeak) { // Check that when we reopen a table we don't lose access to blocks already // in the cache. 
This test checks whether the Table actually makes use of the @@ -2574,69 +3029,6 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); } -TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) { - // A regression test to avoid data race described in - // https://github.com/facebook/rocksdb/issues/1267 - TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); - std::vector keys; - stl_wrappers::KVMap kvmap; - c.Add("a1", "val1"); - Options options; - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - table_options.index_type = BlockBasedTableOptions::kHashSearch; - table_options.cache_index_and_filter_blocks = true; - table_options.block_cache = NewLRUCache(0); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - const MutableCFOptions moptions(options); - c.Finish(options, ioptions, moptions, table_options, - GetPlainInternalComparator(options.comparator), &keys, &kvmap); - - rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( - { - {"BlockBasedTable::NewIndexIterator::thread1:1", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTable::NewIndexIterator::thread2:3", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - }, - { - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:1"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker", - "BlockBasedTable::NewIndexIterator::thread1:4"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:2"}, - {"BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker", - "BlockBasedTable::NewIndexIterator::thread2:3"}, - }); - - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - ReadOptions ro; - auto* reader = c.GetTableReader(); - - std::function func1 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker"); - // TODO(Zhongyi): update test to use MutableCFOptions - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - iter->Seek(InternalKey("a1", 0, kTypeValue).Encode()); - }; - - std::function func2 = [&]() { - TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker"); - std::unique_ptr iter( - reader->NewIterator(ro, moptions.prefix_extractor.get())); - }; - - auto thread1 = port::Thread(func1); - auto thread2 = port::Thread(func2); - thread1.join(); - thread2.join(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - c.ResetTableReader(); -} - // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(PlainTableTest, BasicPlainTableProperties) { @@ -2912,7 +3304,8 @@ TEST_F(MemTableTest, Simple) { batch.DeleteRange(std::string("begin"), std::string("end")); ColumnFamilyMemTablesDefault cf_mems_default(memtable); ASSERT_TRUE( - WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr).ok()); + WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr) + .ok()); for (int i = 0; i < 2; ++i) { Arena arena; @@ -3156,8 +3549,9 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { &kvmap); auto reader = c.GetTableReader(); - std::unique_ptr db_iter( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + std::unique_ptr db_iter(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + 
/*skip_filters=*/false, TableReaderCaller::kUncategorized)); // Test point lookup for (auto& kv : kvmap) { @@ -3349,13 +3743,14 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { EnvOptions(), ikc), std::move(file_reader), ss_rw.contents().size(), &table_reader); - return table_reader->NewIterator(ReadOptions(), - moptions.prefix_extractor.get()); + return table_reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized); }; GetVersionAndGlobalSeqno(); - ASSERT_EQ(2, version); - ASSERT_EQ(0, global_seqno); + ASSERT_EQ(2u, version); + ASSERT_EQ(0u, global_seqno); InternalIterator* iter = GetTableInternalIter(); char current_c = 'a'; @@ -3375,8 +3770,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Update global sequence number to 10 SetGlobalSeqno(10); GetVersionAndGlobalSeqno(); - ASSERT_EQ(2, version); - ASSERT_EQ(10, global_seqno); + ASSERT_EQ(2u, version); + ASSERT_EQ(10u, global_seqno); iter = GetTableInternalIter(); current_c = 'a'; @@ -3412,8 +3807,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Update global sequence number to 3 SetGlobalSeqno(3); GetVersionAndGlobalSeqno(); - ASSERT_EQ(2, version); - ASSERT_EQ(3, global_seqno); + ASSERT_EQ(2u, version); + ASSERT_EQ(3u, global_seqno); iter = GetTableInternalIter(); current_c = 'a'; @@ -3521,7 +3916,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { std::move(file_reader), ss_rw.contents().size(), &table_reader)); std::unique_ptr db_iter(table_reader->NewIterator( - ReadOptions(), moptions2.prefix_extractor.get())); + ReadOptions(), moptions2.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); int expected_key = 1; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { @@ -3590,7 +3986,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, &footer, kBlockBasedTableMagicNumber)); - auto BlockFetchHelper = [&](const BlockHandle& handle, + auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { ReadOptions read_options; read_options.verify_checksums = false; @@ -3599,8 +3995,8 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockFetcher block_fetcher( file, nullptr /* prefetch_buffer */, footer, read_options, handle, contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - cache_options); + false /*maybe_compressed*/, block_type, + UncompressionDict::GetEmptyDict(), cache_options); ASSERT_OK(block_fetcher.ReadBlockContents()); }; @@ -3609,13 +4005,13 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { auto metaindex_handle = footer.metaindex_handle(); BlockContents metaindex_contents; - BlockFetchHelper(metaindex_handle, &metaindex_contents); + BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex, + &metaindex_contents); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); - std::unique_ptr meta_iter( - metaindex_block.NewIterator(BytewiseComparator(), - BytewiseComparator())); + std::unique_ptr meta_iter(metaindex_block.NewDataIterator( + BytewiseComparator(), BytewiseComparator())); bool found_properties_block = true; ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); ASSERT_TRUE(found_properties_block); @@ -3626,11 +4022,12 @@ 
TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { ASSERT_OK(properties_handle.DecodeFrom(&v)); BlockContents properties_contents; - BlockFetchHelper(properties_handle, &properties_contents); + BlockFetchHelper(properties_handle, BlockType::kProperties, + &properties_contents); Block properties_block(std::move(properties_contents), kDisableGlobalSequenceNumber); - ASSERT_EQ(properties_block.NumRestarts(), 1); + ASSERT_EQ(properties_block.NumRestarts(), 1u); } } @@ -3685,16 +4082,16 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { BlockFetcher block_fetcher( table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, - false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), - pcache_opts, nullptr /*memory_allocator*/); + false /*maybe_compressed*/, BlockType::kMetaIndex, + UncompressionDict::GetEmptyDict(), pcache_opts, + nullptr /*memory_allocator*/); ASSERT_OK(block_fetcher.ReadBlockContents()); Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); // verify properties block comes last std::unique_ptr metaindex_iter{ - metaindex_block.NewIterator(options.comparator, - options.comparator)}; + metaindex_block.NewDataIterator(options.comparator, options.comparator)}; uint64_t max_offset = 0; std::string key_at_max_offset; for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); @@ -3812,8 +4209,9 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { auto reader = c.GetTableReader(); std::unique_ptr seek_iter; - seek_iter.reset( - reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + seek_iter.reset(reader->NewIterator( + ReadOptions(), moptions.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); for (int i = 0; i < 2; ++i) { ReadOptions ro; // for every kv, we seek using two method: Get() and Seek() @@ -3837,7 +4235,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { std::string user_key = ExtractUserKey(kv.first).ToString(); GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, kv.first, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kFound); @@ -3863,7 +4261,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, nullptr); + nullptr, true, nullptr, nullptr); ASSERT_OK(reader->Get(ro, encoded_key, &get_context, moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kNotFound); @@ -3894,13 +4292,15 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { Slice upper_bound_slice(upper_bound); read_opt.iterate_upper_bound = &upper_bound_slice; std::unique_ptr iter; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + 
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_FALSE(iter->Valid()); ASSERT_TRUE(iter->IsOutOfBound()); @@ -3930,8 +4330,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { Slice ub_slice1(ub1); read_opt.iterate_upper_bound = &ub_slice1; std::unique_ptr iter; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("bar"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("bar", iter->key()); @@ -3941,8 +4342,9 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { std::string ub2 = "foo_after"; Slice ub_slice2(ub2); read_opt.iterate_upper_bound = &ub_slice2; - iter.reset(new KeyConvertingIterator( - reader->NewIterator(read_opt, nullptr /*prefix_extractor*/))); + iter.reset(new KeyConvertingIterator(reader->NewIterator( + read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized))); iter->Seek("foo"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("foo", iter->key()); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index a8f617dee29..1cb00b63928 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -9,21 +9,21 @@ #include "table/two_level_iterator.h" #include "db/pinned_iterators_manager.h" +#include "memory/arena.h" #include "rocksdb/options.h" #include "rocksdb/table.h" -#include "table/block.h" +#include "table/block_based/block.h" #include "table/format.h" -#include "util/arena.h" namespace rocksdb { namespace { -class TwoLevelIndexIterator : public InternalIteratorBase { +class TwoLevelIndexIterator : public InternalIteratorBase { public: explicit TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); ~TwoLevelIndexIterator() override { first_level_iter_.DeleteIter(false /* is_arena_mode */); @@ -43,7 +43,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { assert(Valid()); return second_level_iter_.key(); } - BlockHandle value() const override { + IndexValue value() const override { assert(Valid()); return second_level_iter_.value(); } @@ -69,12 +69,12 @@ class TwoLevelIndexIterator : public InternalIteratorBase { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIteratorBase* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapperBase first_level_iter_; - IteratorWrapperBase second_level_iter_; // May be nullptr + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
@@ -83,7 +83,7 @@ class TwoLevelIndexIterator : public InternalIteratorBase { TwoLevelIndexIterator::TwoLevelIndexIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) + InternalIteratorBase* first_level_iter) : state_(state), first_level_iter_(first_level_iter) {} void TwoLevelIndexIterator::Seek(const Slice& target) { @@ -177,8 +177,8 @@ void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { } void TwoLevelIndexIterator::SetSecondLevelIterator( - InternalIteratorBase* iter) { - InternalIteratorBase* old_iter = second_level_iter_.Set(iter); + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); delete old_iter; } @@ -186,14 +186,14 @@ void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - BlockHandle handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value().handle; if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIteratorBase* iter = + InternalIteratorBase* iter = state_->NewSecondaryIterator(handle); data_block_handle_ = handle; SetSecondLevelIterator(iter); @@ -203,9 +203,9 @@ void TwoLevelIndexIterator::InitDataBlock() { } // namespace -InternalIteratorBase* NewTwoLevelIterator( +InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter) { + InternalIteratorBase* first_level_iter) { return new TwoLevelIndexIterator(state, first_level_iter); } } // namespace rocksdb diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 55d5c01a4ae..545c29f493e 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -22,11 +22,10 @@ struct TwoLevelIteratorState { TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIteratorBase* NewSecondaryIterator( + virtual InternalIteratorBase* NewSecondaryIterator( const BlockHandle& handle) = 0; }; - // Return a new two level iterator. A two-level iterator contains an // index iterator whose values point to a sequence of blocks where // each block is itself a sequence of key,value pairs. The returned @@ -37,8 +36,8 @@ struct TwoLevelIteratorState { // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. // Note: this function expects first_level_iter was not created using the arena -extern InternalIteratorBase* NewTwoLevelIterator( +extern InternalIteratorBase* NewTwoLevelIterator( TwoLevelIteratorState* state, - InternalIteratorBase* first_level_iter); + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/util/fault_injection_test_env.cc b/test_util/fault_injection_test_env.cc similarity index 84% rename from util/fault_injection_test_env.cc rename to test_util/fault_injection_test_env.cc index 9cad23871b6..5c47b7ea455 100644 --- a/util/fault_injection_test_env.cc +++ b/test_util/fault_injection_test_env.cc @@ -11,7 +11,7 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". 
-#include "util/fault_injection_test_env.h" +#include "test_util/fault_injection_test_env.h" #include #include @@ -98,6 +98,9 @@ Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { } Status TestDirectory::Fsync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } env_->SyncDir(dirname_); return dir_->Fsync(); } @@ -158,6 +161,53 @@ Status TestWritableFile::Sync() { return Status::OK(); } +TestRandomRWFile::TestRandomRWFile(const std::string& /*fname*/, + std::unique_ptr&& f, + FaultInjectionTestEnv* env) + : target_(std::move(f)), file_opened_(true), env_(env) { + assert(target_ != nullptr); +} + +TestRandomRWFile::~TestRandomRWFile() { + if (file_opened_) { + Close(); + } +} + +Status TestRandomRWFile::Write(uint64_t offset, const Slice& data) { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Write(offset, data); +} + +Status TestRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Read(offset, n, result, scratch); +} + +Status TestRandomRWFile::Close() { + file_opened_ = false; + return target_->Close(); +} + +Status TestRandomRWFile::Flush() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Flush(); +} + +Status TestRandomRWFile::Sync() { + if (!env_->IsFilesystemActive()) { + return env_->GetError(); + } + return target_->Sync(); +} + Status FaultInjectionTestEnv::NewDirectory(const std::string& name, std::unique_ptr* result) { std::unique_ptr r; @@ -220,6 +270,27 @@ Status FaultInjectionTestEnv::ReopenWritableFile( return s; } +Status FaultInjectionTestEnv::NewRandomRWFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& soptions) { + if (!IsFilesystemActive()) { + return GetError(); + } + Status s = target()->NewRandomRWFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestRandomRWFile(fname, std::move(*result), this)); + // WritableFileWriter* file is opened + // again then it will be truncated - so forget our saved state. + UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; +} + Status FaultInjectionTestEnv::NewRandomAccessFile( const std::string& fname, std::unique_ptr* result, const EnvOptions& soptions) { @@ -238,7 +309,6 @@ Status FaultInjectionTestEnv::DeleteFile(const std::string& f) { fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), s.ToString().c_str()); } - assert(s.ok()); if (s.ok()) { UntrackFile(f); } diff --git a/util/fault_injection_test_env.h b/test_util/fault_injection_test_env.h similarity index 85% rename from util/fault_injection_test_env.h rename to test_util/fault_injection_test_env.h index a39e5b71e9d..b68b3faedce 100644 --- a/util/fault_injection_test_env.h +++ b/test_util/fault_injection_test_env.h @@ -19,9 +19,9 @@ #include "db/version_set.h" #include "env/mock_env.h" +#include "file/filename.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "util/filename.h" #include "util/mutexlock.h" #include "util/random.h" @@ -82,6 +82,31 @@ class TestWritableFile : public WritableFile { FaultInjectionTestEnv* env_; }; +// A wrapper around WritableFileWriter* file +// is written to or sync'ed. 
+class TestRandomRWFile : public RandomRWFile { + public: + explicit TestRandomRWFile(const std::string& fname, + std::unique_ptr&& f, + FaultInjectionTestEnv* env); + virtual ~TestRandomRWFile(); + Status Write(uint64_t offset, const Slice& data) override; + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + Status Close() override; + Status Flush() override; + Status Sync() override; + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + bool use_direct_io() const override { return target_->use_direct_io(); }; + + private: + std::unique_ptr target_; + bool file_opened_; + FaultInjectionTestEnv* env_; +}; + class TestDirectory : public Directory { public: explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, @@ -114,6 +139,10 @@ class FaultInjectionTestEnv : public EnvWrapper { std::unique_ptr* result, const EnvOptions& soptions) override; + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& soptions) override; + Status NewRandomAccessFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& soptions) override; diff --git a/util/mock_time_env.h b/test_util/mock_time_env.h similarity index 100% rename from util/mock_time_env.h rename to test_util/mock_time_env.h diff --git a/util/sync_point.cc b/test_util/sync_point.cc similarity index 95% rename from util/sync_point.cc rename to test_util/sync_point.cc index 4599c256d9f..a09be9e8fa1 100644 --- a/util/sync_point.cc +++ b/test_util/sync_point.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point.h" -#include "util/sync_point_impl.h" +#include "test_util/sync_point.h" +#include "test_util/sync_point_impl.h" int rocksdb_kill_odds = 0; std::vector rocksdb_kill_prefix_blacklist; diff --git a/util/sync_point.h b/test_util/sync_point.h similarity index 100% rename from util/sync_point.h rename to test_util/sync_point.h diff --git a/util/sync_point_impl.cc b/test_util/sync_point_impl.cc similarity index 98% rename from util/sync_point_impl.cc rename to test_util/sync_point_impl.cc index 248c381a328..db44f472a05 100644 --- a/util/sync_point_impl.cc +++ b/test_util/sync_point_impl.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point_impl.h" +#include "test_util/sync_point_impl.h" #ifndef NDEBUG namespace rocksdb { diff --git a/util/sync_point_impl.h b/test_util/sync_point_impl.h similarity index 98% rename from util/sync_point_impl.h rename to test_util/sync_point_impl.h index 3c7e7049183..d96d7325786 100644 --- a/util/sync_point_impl.h +++ b/test_util/sync_point_impl.h @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "util/sync_point.h" +#include "test_util/sync_point.h" #include #include diff --git a/util/testharness.cc b/test_util/testharness.cc similarity index 97% rename from util/testharness.cc rename to test_util/testharness.cc index 8f5eb2a4d6e..62cc535a198 100644 --- a/util/testharness.cc +++ b/test_util/testharness.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
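// --- Editor's note (not part of the diff): a hedged sketch of how the
// relocated FaultInjectionTestEnv (now with RandomRWFile support) is typically
// driven in a test. SetFilesystemActive() and DropUnsyncedFileData() are
// assumed from the existing class interface; the DB path is arbitrary.
#include "rocksdb/db.h"
#include "test_util/fault_injection_test_env.h"

void FaultInjectionSketch() {
  std::unique_ptr<rocksdb::FaultInjectionTestEnv> fault_env(
      new rocksdb::FaultInjectionTestEnv(rocksdb::Env::Default()));
  rocksdb::Options options;
  options.env = fault_env.get();
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::DB::Open(options, "/tmp/fault_injection_sketch", &db);
  // ... write some data, intentionally without syncing ...
  fault_env->DropUnsyncedFileData();      // simulate losing unsynced writes
  fault_env->SetFilesystemActive(false);  // make every further I/O fail
  // ... assert that subsequent operations surface the injected error ...
  fault_env->SetFilesystemActive(true);
  delete db;
}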
-#include "util/testharness.h" +#include "test_util/testharness.h" #include #include diff --git a/util/testharness.h b/test_util/testharness.h similarity index 100% rename from util/testharness.h rename to test_util/testharness.h diff --git a/util/testutil.cc b/test_util/testutil.cc similarity index 90% rename from util/testutil.cc rename to test_util/testutil.cc index b6493258f60..7c90c14efee 100644 --- a/util/testutil.cc +++ b/test_util/testutil.cc @@ -7,20 +7,24 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/testutil.h" +#include "test_util/testutil.h" +#include #include +#include #include #include "db/memtable_list.h" +#include "file/random_access_file_reader.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" #include "port/port.h" -#include "util/file_reader_writer.h" namespace rocksdb { namespace test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; -const uint32_t kLatestFormatVersion = 4u; +const uint32_t kLatestFormatVersion = 5u; Slice RandomString(Random* rnd, int len, std::string* dst) { dst->resize(len); @@ -162,7 +166,11 @@ std::string RandomName(Random* rnd, const size_t len) { } CompressionType RandomCompressionType(Random* rnd) { - return static_cast(rnd->Uniform(6)); + auto ret = static_cast(rnd->Uniform(6)); + while (!CompressionTypeSupported(ret)) { + ret = static_cast((static_cast(ret) + 1) % 6); + } + return ret; } void RandomCompressionTypeVector(const size_t count, @@ -193,8 +201,12 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { opt.cache_index_and_filter_blocks = rnd->Uniform(2); opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); opt.pin_top_level_index_and_filter = rnd->Uniform(2); - opt.index_type = rnd->Uniform(2) ? 
BlockBasedTableOptions::kBinarySearch - : BlockBasedTableOptions::kHashSearch; + using IndexType = BlockBasedTableOptions::IndexType; + const std::array index_types = { + {IndexType::kBinarySearch, IndexType::kHashSearch, + IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}}; + opt.index_type = + index_types[rnd->Uniform(static_cast(index_types.size()))]; opt.hash_index_allow_collision = rnd->Uniform(2); opt.checksum = static_cast(rnd->Uniform(3)); opt.block_size = rnd->Uniform(10000000); @@ -293,7 +305,8 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->stats_dump_period_sec = rnd->Uniform(100000); } -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, + Random* rnd) { cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4)); // boolean options @@ -322,6 +335,7 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { cf_opt->max_mem_compaction_level = rnd->Uniform(100); cf_opt->max_write_buffer_number = rnd->Uniform(100); cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100); + cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000); cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100); cf_opt->num_levels = rnd->Uniform(100); cf_opt->target_file_size_multiplier = rnd->Uniform(100); @@ -345,8 +359,10 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) { // uint64_t options static const uint64_t uint_max = static_cast(UINT_MAX); - cf_opt->ttl = uint_max + rnd->Uniform(10000); - cf_opt->periodic_compaction_seconds = uint_max + rnd->Uniform(10000); + cf_opt->ttl = + db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0; + cf_opt->periodic_compaction_seconds = + db_options.max_open_files == -1 ? 
uint_max + rnd->Uniform(10000) : 0; cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000); cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000); cf_opt->max_compaction_bytes = @@ -414,5 +430,22 @@ bool IsDirectIOSupported(Env* env, const std::string& dir) { return s.ok(); } +size_t GetLinesCount(const std::string& fname, const std::string& pattern) { + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(fname.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(pattern) != std::string::npos) { + count++; + } + } + + return count; +} + } // namespace test } // namespace rocksdb diff --git a/util/testutil.h b/test_util/testutil.h similarity index 81% rename from util/testutil.h rename to test_util/testutil.h index 2aab3df72c4..716ae7d26e8 100644 --- a/util/testutil.h +++ b/test_util/testutil.h @@ -20,9 +20,9 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based_table_factory.h" +#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain_table_factory.h" +#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" #include "util/random.h" @@ -492,13 +492,11 @@ inline std::string EncodeInt(uint64_t x) { return result; } -class StringEnv : public EnvWrapper { - public: class SeqStringSource : public SequentialFile { public: explicit SeqStringSource(const std::string& data) : data_(data), offset_(0) {} - ~SeqStringSource() {} + ~SeqStringSource() override {} Status Read(size_t n, Slice* result, char* scratch) override { std::string output; if (offset_ < data_.size()) { @@ -527,129 +525,136 @@ class StringEnv : public EnvWrapper { size_t offset_; }; - class StringSink : public WritableFile { + class StringEnv : public EnvWrapper { public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } + class StringSink : public WritableFile { + public: + explicit StringSink(std::string* contents) + : WritableFile(), contents_(contents) {} + virtual Status Truncate(uint64_t size) override { + contents_->resize(static_cast(size)); + return Status::OK(); + } + virtual Status Close() override { return Status::OK(); } + virtual Status Flush() override { return Status::OK(); } + virtual Status Sync() override { return Status::OK(); } + virtual Status Append(const Slice& slice) override { + contents_->append(slice.data(), slice.size()); + return Status::OK(); + } - private: - std::string* contents_; - }; + private: + std::string* contents_; + }; - explicit StringEnv(Env* t) : EnvWrapper(t) {} - virtual ~StringEnv() {} + explicit StringEnv(Env* t) : EnvWrapper(t) {} + ~StringEnv() override {} - const std::string& GetContent(const std::string& f) { return files_[f]; } + const std::string& GetContent(const std::string& f) { return files_[f]; } - const Status WriteToNewFile(const std::string& file_name, - const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (!s.ok()) { - 
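// --- Editor's note (not part of the diff): example use of the
// test::GetLinesCount() helper added above, e.g. counting how often a pattern
// appears in an info LOG; the path and pattern are arbitrary.
#include "test_util/testutil.h"

size_t flush_lines =
    rocksdb::test::GetLinesCount("/tmp/rocksdbtest/LOG", "flush_started");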
return s; + const Status WriteToNewFile(const std::string& file_name, + const std::string& content) { + std::unique_ptr r; + auto s = NewWritableFile(file_name, &r, EnvOptions()); + if (!s.ok()) { + return s; + } + r->Append(content); + r->Flush(); + r->Close(); + assert(files_[file_name] == content); + return Status::OK(); } - r->Append(content); - r->Flush(); - r->Close(); - assert(files_[file_name] == content); - return Status::OK(); - } - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, + // The following text is boilerplate that forwards all methods to target() + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& /*options*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist", f); + } + r->reset(new SeqStringSource(iter->second)); + return Status::OK(); + } + Status NewRandomAccessFile(const std::string& /*f*/, + std::unique_ptr* /*r*/, + const EnvOptions& /*options*/) override { + return Status::NotSupported(); + } + Status NewWritableFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); + auto iter = files_.find(f); + if (iter != files_.end()) { + return Status::IOError("The specified file already exists", f); + } + r->reset(new StringSink(&files_[f])); + return Status::OK(); } - r->reset(new SeqStringSource(iter->second)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); + virtual Status NewDirectory( + const std::string& /*name*/, + std::unique_ptr* /*result*/) override { + return Status::NotSupported(); } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory(const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); + Status FileExists(const std::string& f) override { + if (files_.find(f) == files_.end()) { + return Status::NotFound(); + } + return Status::OK(); } - return Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); + Status GetChildren(const std::string& /*dir*/, + std::vector* /*r*/) override { + return Status::NotSupported(); + } + Status DeleteFile(const 
std::string& f) override { + files_.erase(f); + return Status::OK(); + } + Status CreateDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status CreateDirIfMissing(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status DeleteDir(const std::string& /*d*/) override { + return Status::NotSupported(); + } + Status GetFileSize(const std::string& f, uint64_t* s) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return Status::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return Status::OK(); } - *s = iter->second.size(); - return Status::OK(); - } - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } + Status GetFileModificationTime(const std::string& /*fname*/, + uint64_t* /*file_mtime*/) override { + return Status::NotSupported(); + } - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status RenameFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LinkFile(const std::string& /*s*/, const std::string& /*t*/) override { - return Status::NotSupported(); - } + Status LinkFile(const std::string& /*s*/, + const std::string& /*t*/) override { + return Status::NotSupported(); + } - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); - } + Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { + return Status::NotSupported(); + } - Status UnlockFile(FileLock* /*l*/) override { return Status::NotSupported(); } + Status UnlockFile(FileLock* /*l*/) override { + return Status::NotSupported(); + } - protected: - std::unordered_map files_; -}; + protected: + std::unordered_map files_; + }; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); @@ -657,7 +662,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); // Randomly initialize the given ColumnFamilyOptions // Note that the caller is responsible for releasing non-null // cf_opt->compaction_filter. -void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd); +void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions&, Random* rnd); // A dummy merge operator which can change its name class ChanglingMergeOperator : public MergeOperator { @@ -750,5 +755,8 @@ Status DestroyDir(Env* env, const std::string& dir); bool IsDirectIOSupported(Env* env, const std::string& dir); +// Return the number of lines where a given pattern was found in a file. +size_t GetLinesCount(const std::string& fname, const std::string& pattern); + } // namespace test } // namespace rocksdb diff --git a/util/transaction_test_util.cc b/test_util/transaction_test_util.cc similarity index 97% rename from util/transaction_test_util.cc rename to test_util/transaction_test_util.cc index 30cff11e14d..7043eb2d789 100644 --- a/util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -4,14 +4,10 @@ // (found in the LICENSE.Apache file in the root directory). 
#ifndef ROCKSDB_LITE -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif +#include "test_util/transaction_test_util.h" -#include "util/transaction_test_util.h" - -#include #include +#include #include #include #include @@ -24,7 +20,7 @@ #include "db/dbformat.h" #include "db/snapshot_impl.h" -#include "util/logging.h" +#include "logging/logging.h" #include "util/random.h" #include "util/string_util.h" @@ -205,6 +201,12 @@ bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn, ROCKS_LOG_DEBUG(db->GetDBOptions().info_log, "Prepare of %" PRIu64 " %s (%s)", txn->GetId(), s.ToString().c_str(), txn->GetName().c_str()); + if (rand_->OneIn(20)) { + // This currently only tests the mechanics of writing commit time + // write batch so the exact values would not matter. + s = txn_->GetCommitTimeWriteBatch()->Put("cat", "dog"); + assert(s.ok()); + } db->GetDBOptions().env->SleepForMicroseconds( static_cast(cmt_delay_ms_ * 1000)); } diff --git a/util/transaction_test_util.h b/test_util/transaction_test_util.h similarity index 100% rename from util/transaction_test_util.h rename to test_util/transaction_test_util.h diff --git a/third-party/folly/folly/CPortability.h b/third-party/folly/folly/CPortability.h new file mode 100644 index 00000000000..56cb6b1a58c --- /dev/null +++ b/third-party/folly/folly/CPortability.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +/** + * Macro for marking functions as having public visibility. + */ +#if defined(__GNUC__) +#define FOLLY_EXPORT __attribute__((__visibility__("default"))) +#else +#define FOLLY_EXPORT +#endif + +#if defined(__has_feature) +#define FOLLY_HAS_FEATURE(...) __has_feature(__VA_ARGS__) +#else +#define FOLLY_HAS_FEATURE(...) 0 +#endif + +#if FOLLY_HAS_FEATURE(thread_sanitizer) || __SANITIZE_THREAD__ +#ifndef FOLLY_SANITIZE_THREAD +#define FOLLY_SANITIZE_THREAD 1 +#endif +#endif diff --git a/third-party/folly/folly/ConstexprMath.h b/third-party/folly/folly/ConstexprMath.h new file mode 100644 index 00000000000..f09167e0d42 --- /dev/null +++ b/third-party/folly/folly/ConstexprMath.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +namespace folly { +template +constexpr T constexpr_max(T a) { + return a; +} +template +constexpr T constexpr_max(T a, T b, Ts... ts) { + return b < a ? constexpr_max(a, ts...) : constexpr_max(b, ts...); +} + +namespace detail { +template +constexpr T constexpr_log2_(T a, T e) { + return e == T(1) ? a : constexpr_log2_(a + T(1), e / T(2)); +} + +template +constexpr T constexpr_log2_ceil_(T l2, T t) { + return l2 + T(T(1) << l2 < t ? 
1 : 0); +} + +template +constexpr T constexpr_square_(T t) { + return t * t; +} +} // namespace detail + +template +constexpr T constexpr_log2(T t) { + return detail::constexpr_log2_(T(0), t); +} + +template +constexpr T constexpr_log2_ceil(T t) { + return detail::constexpr_log2_ceil_(constexpr_log2(t), t); +} + +} // namespace folly diff --git a/third-party/folly/folly/Indestructible.h b/third-party/folly/folly/Indestructible.h new file mode 100644 index 00000000000..68249d86512 --- /dev/null +++ b/third-party/folly/folly/Indestructible.h @@ -0,0 +1,166 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include + +namespace folly { + +/*** + * Indestructible + * + * When you need a Meyers singleton that will not get destructed, even at + * shutdown, and you also want the object stored inline. + * + * Use like: + * + * void doSomethingWithExpensiveData(); + * + * void doSomethingWithExpensiveData() { + * static const Indestructible> data{ + * map{{"key1", 17}, {"key2", 19}, {"key3", 23}}, + * }; + * callSomethingTakingAMapByRef(*data); + * } + * + * This should be used only for Meyers singletons, and, even then, only when + * the instance does not need to be destructed ever. + * + * This should not be used more generally, e.g., as member fields, etc. + * + * This is designed as an alternative, but with one fewer allocation at + * construction time and one fewer pointer dereference at access time, to the + * Meyers singleton pattern of: + * + * void doSomethingWithExpensiveData() { + * static const auto data = // never `delete`d + * new map{{"key1", 17}, {"key2", 19}, {"key3", 23}}; + * callSomethingTakingAMapByRef(*data); + * } + */ + +template +class Indestructible final { + public: + template + constexpr Indestructible() noexcept(noexcept(T())) {} + + /** + * Constructor accepting a single argument by forwarding reference, this + * allows using list initialzation without the overhead of things like + * in_place, etc and also works with std::initializer_list constructors + * which can't be deduced, the default parameter helps there. + * + * auto i = folly::Indestructible>{{{1, 2}}}; + * + * This provides convenience + * + * There are two versions of this constructor - one for when the element is + * implicitly constructible from the given argument and one for when the + * type is explicitly but not implicitly constructible from the given + * argument. + */ + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + explicit constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + template < + typename U = T, + _t::value>>* = nullptr, + _t, remove_cvref_t>::value>>* = + nullptr, + _t::value>>* = nullptr> + /* implicit */ constexpr Indestructible(U&& u) noexcept( + noexcept(T(std::declval()))) + : storage_(std::forward(u)) {} + + template ()...))> + explicit constexpr Indestructible(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : storage_(std::forward(args)...) {} + template < + typename U, + typename... Args, + typename = decltype( + T(std::declval&>(), + std::declval()...))> + explicit constexpr Indestructible(std::initializer_list il, Args... 
args) noexcept( + noexcept( + T(std::declval&>(), + std::declval()...))) + : storage_(il, std::forward(args)...) {} + + ~Indestructible() = default; + + Indestructible(Indestructible const&) = delete; + Indestructible& operator=(Indestructible const&) = delete; + + Indestructible(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) + : storage_(std::move(other.storage_.value)) { + other.erased_ = true; + } + Indestructible& operator=(Indestructible&& other) noexcept( + noexcept(T(std::declval()))) { + storage_.value = std::move(other.storage_.value); + other.erased_ = true; + } + + T* get() noexcept { + check(); + return &storage_.value; + } + T const* get() const noexcept { + check(); + return &storage_.value; + } + T& operator*() noexcept { + return *get(); + } + T const& operator*() const noexcept { + return *get(); + } + T* operator->() noexcept { + return get(); + } + T const* operator->() const noexcept { + return get(); + } + + private: + void check() const noexcept { + assert(!erased_); + } + + union Storage { + T value; + + template + constexpr Storage() noexcept(noexcept(T())) : value() {} + + template ()...))> + explicit constexpr Storage(Args&&... args) noexcept( + noexcept(T(std::declval()...))) + : value(std::forward(args)...) {} + + ~Storage() {} + }; + + Storage storage_{}; + bool erased_{false}; +}; +} // namespace folly diff --git a/third-party/folly/folly/Optional.h b/third-party/folly/folly/Optional.h new file mode 100644 index 00000000000..ee12467dda7 --- /dev/null +++ b/third-party/folly/folly/Optional.h @@ -0,0 +1,570 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +/* + * Optional - For conditional initialization of values, like boost::optional, + * but with support for move semantics and emplacement. Reference type support + * has not been included due to limited use cases and potential confusion with + * semantics of assignment: Assigning to an optional reference could quite + * reasonably copy its value or redirect the reference. 
+ * + * Optional can be useful when a variable might or might not be needed: + * + * Optional maybeLogger = ...; + * if (maybeLogger) { + * maybeLogger->log("hello"); + * } + * + * Optional enables a 'null' value for types which do not otherwise have + * nullability, especially useful for parameter passing: + * + * void testIterator(const unique_ptr& it, + * initializer_list idsExpected, + * Optional> ranksExpected = none) { + * for (int i = 0; it->next(); ++i) { + * EXPECT_EQ(it->doc().id(), idsExpected[i]); + * if (ranksExpected) { + * EXPECT_EQ(it->doc().rank(), (*ranksExpected)[i]); + * } + * } + * } + * + * Optional models OptionalPointee, so calling 'get_pointer(opt)' will return a + * pointer to nullptr if the 'opt' is empty, and a pointer to the value if it is + * not: + * + * Optional maybeInt = ...; + * if (int* v = get_pointer(maybeInt)) { + * cout << *v << endl; + * } + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace folly { + +template +class Optional; + +namespace detail { +template +struct OptionalPromiseReturn; +} // namespace detail + +struct None { + enum class _secret { _token }; + + /** + * No default constructor to support both `op = {}` and `op = none` + * as syntax for clearing an Optional, just like std::nullopt_t. + */ + constexpr explicit None(_secret) {} +}; +constexpr None none{None::_secret::_token}; + +class FOLLY_EXPORT OptionalEmptyException : public std::runtime_error { + public: + OptionalEmptyException() + : std::runtime_error("Empty Optional cannot be unwrapped") {} +}; + +template +class Optional { + public: + typedef Value value_type; + + static_assert( + !std::is_reference::value, + "Optional may not be used with reference types"); + static_assert( + !std::is_abstract::value, + "Optional may not be used with abstract types"); + + Optional() noexcept {} + + Optional(const Optional& src) noexcept( + std::is_nothrow_copy_constructible::value) { + if (src.hasValue()) { + construct(src.value()); + } + } + + Optional(Optional&& src) noexcept( + std::is_nothrow_move_constructible::value) { + if (src.hasValue()) { + construct(std::move(src.value())); + src.clear(); + } + } + + /* implicit */ Optional(const None&) noexcept {} + + /* implicit */ Optional(Value&& newValue) noexcept( + std::is_nothrow_move_constructible::value) { + construct(std::move(newValue)); + } + + /* implicit */ Optional(const Value& newValue) noexcept( + std::is_nothrow_copy_constructible::value) { + construct(newValue); + } + + template + explicit Optional(in_place_t, Args&&... args) noexcept( + std::is_nothrow_constructible::value) + : Optional{PrivateConstructor{}, std::forward(args)...} {} + + template + explicit Optional( + in_place_t, + std::initializer_list il, + Args&&... 
args) noexcept(std:: + is_nothrow_constructible< + Value, + std::initializer_list, + Args...>::value) + : Optional{PrivateConstructor{}, il, std::forward(args)...} {} + + // Used only when an Optional is used with coroutines on MSVC + /* implicit */ Optional(const detail::OptionalPromiseReturn& p) + : Optional{} { + p.promise_->value_ = this; + } + + void assign(const None&) { + clear(); + } + + void assign(Optional&& src) { + if (this != &src) { + if (src.hasValue()) { + assign(std::move(src.value())); + src.clear(); + } else { + clear(); + } + } + } + + void assign(const Optional& src) { + if (src.hasValue()) { + assign(src.value()); + } else { + clear(); + } + } + + void assign(Value&& newValue) { + if (hasValue()) { + storage_.value = std::move(newValue); + } else { + construct(std::move(newValue)); + } + } + + void assign(const Value& newValue) { + if (hasValue()) { + storage_.value = newValue; + } else { + construct(newValue); + } + } + + Optional& operator=(None) noexcept { + reset(); + return *this; + } + + template + Optional& operator=(Arg&& arg) { + assign(std::forward(arg)); + return *this; + } + + Optional& operator=(Optional&& other) noexcept( + std::is_nothrow_move_assignable::value) { + assign(std::move(other)); + return *this; + } + + Optional& operator=(const Optional& other) noexcept( + std::is_nothrow_copy_assignable::value) { + assign(other); + return *this; + } + + template + Value& emplace(Args&&... args) { + clear(); + construct(std::forward(args)...); + return value(); + } + + template + typename std::enable_if< + std::is_constructible&, Args&&...>::value, + Value&>::type + emplace(std::initializer_list ilist, Args&&... args) { + clear(); + construct(ilist, std::forward(args)...); + return value(); + } + + void reset() noexcept { + storage_.clear(); + } + + void clear() noexcept { + reset(); + } + + void swap(Optional& that) noexcept(IsNothrowSwappable::value) { + if (hasValue() && that.hasValue()) { + using std::swap; + swap(value(), that.value()); + } else if (hasValue()) { + that.emplace(std::move(value())); + reset(); + } else if (that.hasValue()) { + emplace(std::move(that.value())); + that.reset(); + } + } + + const Value& value() const& { + require_value(); + return storage_.value; + } + + Value& value() & { + require_value(); + return storage_.value; + } + + Value&& value() && { + require_value(); + return std::move(storage_.value); + } + + const Value&& value() const&& { + require_value(); + return std::move(storage_.value); + } + + const Value* get_pointer() const& { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() & { + return storage_.hasValue ? &storage_.value : nullptr; + } + Value* get_pointer() && = delete; + + bool has_value() const noexcept { + return storage_.hasValue; + } + + bool hasValue() const noexcept { + return has_value(); + } + + explicit operator bool() const noexcept { + return has_value(); + } + + const Value& operator*() const& { + return value(); + } + Value& operator*() & { + return value(); + } + const Value&& operator*() const&& { + return std::move(value()); + } + Value&& operator*() && { + return std::move(value()); + } + + const Value* operator->() const { + return &value(); + } + Value* operator->() { + return &value(); + } + + // Return a copy of the value if set, or a given default if not. 
+ template + Value value_or(U&& dflt) const& { + if (storage_.hasValue) { + return storage_.value; + } + + return std::forward(dflt); + } + + template + Value value_or(U&& dflt) && { + if (storage_.hasValue) { + return std::move(storage_.value); + } + + return std::forward(dflt); + } + + private: + template + friend Optional<_t>> make_optional(T&&); + template + friend Optional make_optional(Args&&... args); + template + friend Optional make_optional(std::initializer_list, As&&...); + + /** + * Construct the optional in place, this is duplicated as a non-explicit + * constructor to allow returning values that are non-movable from + * make_optional using list initialization. + * + * Until C++17, at which point this will become unnecessary because of + * specified prvalue elision. + */ + struct PrivateConstructor { + explicit PrivateConstructor() = default; + }; + template + Optional(PrivateConstructor, Args&&... args) noexcept( + std::is_constructible::value) { + construct(std::forward(args)...); + } + + void require_value() const { + if (!storage_.hasValue) { + throw OptionalEmptyException{}; + } + } + + template + void construct(Args&&... args) { + const void* ptr = &storage_.value; + // For supporting const types. + new (const_cast(ptr)) Value(std::forward(args)...); + storage_.hasValue = true; + } + + struct StorageTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageTriviallyDestructible() + : emptyState('\0'), hasValue{false} {} + void clear() { + hasValue = false; + } + }; + + struct StorageNonTriviallyDestructible { + union { + char emptyState; + Value value; + }; + bool hasValue; + + StorageNonTriviallyDestructible() : hasValue{false} {} + ~StorageNonTriviallyDestructible() { + clear(); + } + + void clear() { + if (hasValue) { + hasValue = false; + value.~Value(); + } + } + }; + + using Storage = typename std::conditional< + std::is_trivially_destructible::value, + StorageTriviallyDestructible, + StorageNonTriviallyDestructible>::type; + + Storage storage_; +}; + +template +const T* get_pointer(const Optional& opt) { + return opt.get_pointer(); +} + +template +T* get_pointer(Optional& opt) { + return opt.get_pointer(); +} + +template +void swap(Optional& a, Optional& b) noexcept(noexcept(a.swap(b))) { + a.swap(b); +} + +template +Optional<_t>> make_optional(T&& v) { + using PrivateConstructor = + typename folly::Optional<_t>>::PrivateConstructor; + return {PrivateConstructor{}, std::forward(v)}; +} + +template +folly::Optional make_optional(Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, std::forward(args)...}; +} + +template +folly::Optional make_optional( + std::initializer_list il, + Args&&... args) { + using PrivateConstructor = typename folly::Optional::PrivateConstructor; + return {PrivateConstructor{}, il, std::forward(args)...}; +} + +/////////////////////////////////////////////////////////////////////////////// +// Comparisons. 
+ +template +bool operator==(const Optional& a, const V& b) { + return a.hasValue() && a.value() == b; +} + +template +bool operator!=(const Optional& a, const V& b) { + return !(a == b); +} + +template +bool operator==(const U& a, const Optional& b) { + return b.hasValue() && b.value() == a; +} + +template +bool operator!=(const U& a, const Optional& b) { + return !(a == b); +} + +template +bool operator==(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return false; + } + if (a.hasValue()) { + return a.value() == b.value(); + } + return true; +} + +template +bool operator!=(const Optional& a, const Optional& b) { + return !(a == b); +} + +template +bool operator<(const Optional& a, const Optional& b) { + if (a.hasValue() != b.hasValue()) { + return a.hasValue() < b.hasValue(); + } + if (a.hasValue()) { + return a.value() < b.value(); + } + return false; +} + +template +bool operator>(const Optional& a, const Optional& b) { + return b < a; +} + +template +bool operator<=(const Optional& a, const Optional& b) { + return !(b < a); +} + +template +bool operator>=(const Optional& a, const Optional& b) { + return !(a < b); +} + +// Suppress comparability of Optional with T, despite implicit conversion. +template +bool operator<(const Optional&, const V& other) = delete; +template +bool operator<=(const Optional&, const V& other) = delete; +template +bool operator>=(const Optional&, const V& other) = delete; +template +bool operator>(const Optional&, const V& other) = delete; +template +bool operator<(const V& other, const Optional&) = delete; +template +bool operator<=(const V& other, const Optional&) = delete; +template +bool operator>=(const V& other, const Optional&) = delete; +template +bool operator>(const V& other, const Optional&) = delete; + +// Comparisons with none +template +bool operator==(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator==(None, const Optional& a) noexcept { + return !a.hasValue(); +} +template +bool operator<(const Optional&, None) noexcept { + return false; +} +template +bool operator<(None, const Optional& a) noexcept { + return a.hasValue(); +} +template +bool operator>(const Optional& a, None) noexcept { + return a.hasValue(); +} +template +bool operator>(None, const Optional&) noexcept { + return false; +} +template +bool operator<=(None, const Optional&) noexcept { + return true; +} +template +bool operator<=(const Optional& a, None) noexcept { + return !a.hasValue(); +} +template +bool operator>=(const Optional&, None) noexcept { + return true; +} +template +bool operator>=(None, const Optional& a) noexcept { + return !a.hasValue(); +} + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace folly diff --git a/third-party/folly/folly/Portability.h b/third-party/folly/folly/Portability.h new file mode 100644 index 00000000000..61c05ff2254 --- /dev/null +++ b/third-party/folly/folly/Portability.h @@ -0,0 +1,84 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
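A short usage sketch of the Optional added above; the include path is an assumption about how third-party/folly lands on the include path, and the function is hypothetical.

#include <iostream>
#include <string>
#include "folly/Optional.h"  // assumed include path

folly::Optional<std::string> FindUser(int id) {
  if (id == 42) {
    return std::string("rocksdb");  // implicit construction from a value
  }
  return folly::none;               // empty state, analogous to std::nullopt
}

int main() {
  folly::Optional<std::string> u = FindUser(42);
  if (u) {                          // explicit operator bool
    std::cout << *u << "\n";        // operator* returns the contained value
  }
  std::cout << FindUser(7).value_or("unknown") << "\n";  // default when empty
  u = folly::none;                  // clears the Optional
  return 0;
}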
+ +#pragma once + +#include + +#if defined(__arm__) +#define FOLLY_ARM 1 +#else +#define FOLLY_ARM 0 +#endif + +#if defined(__x86_64__) || defined(_M_X64) +#define FOLLY_X64 1 +#else +#define FOLLY_X64 0 +#endif + +#if defined(__aarch64__) +#define FOLLY_AARCH64 1 +#else +#define FOLLY_AARCH64 0 +#endif + +#if defined(__powerpc64__) +#define FOLLY_PPC64 1 +#else +#define FOLLY_PPC64 0 +#endif + +#if defined(__has_builtin) +#define FOLLY_HAS_BUILTIN(...) __has_builtin(__VA_ARGS__) +#else +#define FOLLY_HAS_BUILTIN(...) 0 +#endif + +#if defined(__has_cpp_attribute) +#if __has_cpp_attribute(nodiscard) +#define FOLLY_NODISCARD [[nodiscard]] +#endif +#endif +#if !defined FOLLY_NODISCARD +#if defined(_MSC_VER) && (_MSC_VER >= 1700) +#define FOLLY_NODISCARD _Check_return_ +#elif defined(__GNUC__) +#define FOLLY_NODISCARD __attribute__((__warn_unused_result__)) +#else +#define FOLLY_NODISCARD +#endif +#endif + +namespace folly { +constexpr bool kIsArchArm = FOLLY_ARM == 1; +constexpr bool kIsArchAmd64 = FOLLY_X64 == 1; +constexpr bool kIsArchAArch64 = FOLLY_AARCH64 == 1; +constexpr bool kIsArchPPC64 = FOLLY_PPC64 == 1; +} // namespace folly + +namespace folly { +#ifdef NDEBUG +constexpr auto kIsDebug = false; +#else +constexpr auto kIsDebug = true; +#endif +} // namespace folly + +namespace folly { +#if defined(_MSC_VER) +constexpr bool kIsMsvc = true; +#else +constexpr bool kIsMsvc = false; +#endif +} // namespace folly + +namespace folly { +#if FOLLY_SANITIZE_THREAD +constexpr bool kIsSanitizeThread = true; +#else +constexpr bool kIsSanitizeThread = false; +#endif +} // namespace folly diff --git a/third-party/folly/folly/ScopeGuard.h b/third-party/folly/folly/ScopeGuard.h new file mode 100644 index 00000000000..71134406303 --- /dev/null +++ b/third-party/folly/folly/ScopeGuard.h @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include +#include + +namespace folly { +namespace scope_guard_detail { +template +class ScopeGuardImpl { + public: + explicit ScopeGuardImpl(F&& f) : f_{std::forward(f)} {} + ~ScopeGuardImpl() { + f_(); + } + + private: + F f_; +}; + +enum class ScopeGuardEnum {}; +template >> +ScopeGuardImpl operator+(ScopeGuardEnum, Func&& func) { + return ScopeGuardImpl{std::forward(func)}; +} +} // namespace scope_guard_detail +} // namespace folly + +/** + * FB_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#ifndef FB_ANONYMOUS_VARIABLE +#define FB_CONCATENATE_IMPL(s1, s2) s1##s2 +#define FB_CONCATENATE(s1, s2) FB_CONCATENATE_IMPL(s1, s2) +#ifdef __COUNTER__ +#define FB_ANONYMOUS_VARIABLE(str) \ + FB_CONCATENATE(FB_CONCATENATE(FB_CONCATENATE(str, __COUNTER__), _), __LINE__) +#else +#define FB_ANONYMOUS_VARIABLE(str) FB_CONCATENATE(str, __LINE__) +#endif +#endif + +#ifndef SCOPE_EXIT +#define SCOPE_EXIT \ + auto FB_ANONYMOUS_VARIABLE(SCOPE_EXIT_STATE) = \ + ::folly::scope_guard_detail::ScopeGuardEnum{} + [&]() noexcept +#endif diff --git a/third-party/folly/folly/Traits.h b/third-party/folly/folly/Traits.h new file mode 100644 index 00000000000..ea7e1eb1c05 --- /dev/null +++ b/third-party/folly/folly/Traits.h @@ -0,0 +1,152 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
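For the ScopeGuard.h header above, a small sketch of how SCOPE_EXIT is used in practice: the lambda body runs when the enclosing scope is left, on any exit path, and FB_ANONYMOUS_VARIABLE gives the guard object a unique name. The include path and the example function are assumptions.

#include <cstdio>
#include "folly/ScopeGuard.h"  // assumed include path

void WriteWithCleanup(const char* path) {
  std::FILE* f = std::fopen(path, "w");
  if (f == nullptr) {
    return;
  }
  SCOPE_EXIT {
    std::fclose(f);  // runs when this scope exits, on every path
  };
  std::fputs("hello\n", f);
  // Early returns past this point still close the file via the guard.
}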
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +namespace folly { + +#if !defined(_MSC_VER) +template +struct is_trivially_copyable + : std::integral_constant {}; +#else +template +using is_trivially_copyable = std::is_trivially_copyable; +#endif + +/*** + * _t + * + * Instead of: + * + * using decayed = typename std::decay::type; + * + * With the C++14 standard trait aliases, we could use: + * + * using decayed = std::decay_t; + * + * Without them, we could use: + * + * using decayed = _t>; + * + * Also useful for any other library with template types having dependent + * member types named `type`, like the standard trait types. + */ +template +using _t = typename T::type; + +/** + * type_t + * + * A type alias for the first template type argument. `type_t` is useful for + * controlling class-template and function-template partial specialization. + * + * Example: + * + * template + * class Container { + * public: + * template + * Container( + * type_t()...))>, + * Args&&...); + * }; + * + * void_t + * + * A type alias for `void`. `void_t` is useful for controling class-template + * and function-template partial specialization. + * + * Example: + * + * // has_value_type::value is true if T has a nested type `value_type` + * template + * struct has_value_type + * : std::false_type {}; + * + * template + * struct has_value_type> + * : std::true_type {}; + */ + +/** + * There is a bug in libstdc++, libc++, and MSVC's STL that causes it to + * ignore unused template parameter arguments in template aliases and does not + * cause substitution failures. This defect has been recorded here: + * http://open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#1558. + * + * This causes the implementation of std::void_t to be buggy, as it is likely + * defined as something like the following: + * + * template + * using void_t = void; + * + * This causes the compiler to ignore all the template arguments and does not + * help when one wants to cause substitution failures. Rather declarations + * which have void_t in orthogonal specializations are treated as the same. + * For example, assuming the possible `T` types are only allowed to have + * either the alias `one` or `two` and never both or none: + * + * template ::one>* = nullptr> + * void foo(T&&) {} + * template ::two>* = nullptr> + * void foo(T&&) {} + * + * The second foo() will be a redefinition because it conflicts with the first + * one; void_t does not cause substitution failures - the template types are + * just ignored. 
+ */ + +namespace traits_detail { +template +struct type_t_ { + using type = T; +}; +} // namespace traits_detail + +template +using type_t = typename traits_detail::type_t_::type; +template +using void_t = type_t; + +/** + * A type trait to remove all const volatile and reference qualifiers on a + * type T + */ +template +struct remove_cvref { + using type = + typename std::remove_cv::type>::type; +}; +template +using remove_cvref_t = typename remove_cvref::type; + +template +struct IsNothrowSwappable + : std::integral_constant< + bool, + std::is_nothrow_move_constructible::value&& noexcept( + std::swap(std::declval(), std::declval()))> {}; + +template +struct Conjunction : std::true_type {}; +template +struct Conjunction : T {}; +template +struct Conjunction + : std::conditional, T>::type {}; + +template +struct Negation : std::integral_constant {}; + +template +using index_constant = std::integral_constant; + +} // namespace folly diff --git a/third-party/folly/folly/Unit.h b/third-party/folly/folly/Unit.h new file mode 100644 index 00000000000..c8cb77e2c37 --- /dev/null +++ b/third-party/folly/folly/Unit.h @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +namespace folly { + +/// In functional programming, the degenerate case is often called "unit". In +/// C++, "void" is often the best analogue. However, because of the syntactic +/// special-casing required for void, it is frequently a liability for template +/// metaprogramming. So, instead of writing specializations to handle cases like +/// SomeContainer, a library author may instead rule that out and simply +/// have library users use SomeContainer. Contained values may be ignored. +/// Much easier. +/// +/// "void" is the type that admits of no values at all. It is not possible to +/// construct a value of this type. +/// "unit" is the type that admits of precisely one unique value. It is +/// possible to construct a value of this type, but it is always the same value +/// every time, so it is uninteresting. +struct Unit { + constexpr bool operator==(const Unit& /*other*/) const { + return true; + } + constexpr bool operator!=(const Unit& /*other*/) const { + return false; + } +}; + +constexpr Unit unit{}; + +template +struct lift_unit { + using type = T; +}; +template <> +struct lift_unit { + using type = Unit; +}; +template +using lift_unit_t = typename lift_unit::type; + +template +struct drop_unit { + using type = T; +}; +template <> +struct drop_unit { + using type = void; +}; +template +using drop_unit_t = typename drop_unit::type; + +} // namespace folly + diff --git a/third-party/folly/folly/Utility.h b/third-party/folly/folly/Utility.h new file mode 100644 index 00000000000..7e43bdc2f17 --- /dev/null +++ b/third-party/folly/folly/Utility.h @@ -0,0 +1,141 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
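A sketch of the detection idiom that the void_t discussion above is about, mirroring the has_value_type example from the header comment; because folly::void_t is routed through traits_detail::type_t_, the nested-type check participates in substitution and SFINAE behaves as intended. The include path is an assumption.

#include <type_traits>
#include <vector>
#include "folly/Traits.h"  // assumed include path

// true iff T has a nested `value_type`.
template <typename T, typename = void>
struct has_value_type : std::false_type {};

template <typename T>
struct has_value_type<T, folly::void_t<typename T::value_type>>
    : std::true_type {};

static_assert(has_value_type<std::vector<int>>::value, "");
static_assert(!has_value_type<int>::value, "");

// remove_cvref strips reference and cv-qualifiers in one step.
static_assert(
    std::is_same<folly::remove_cvref_t<const int&>, int>::value, "");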
+ +#pragma once + +#include +#include + +namespace folly { + +/** + * Backports from C++17 of: + * std::in_place_t + * std::in_place_type_t + * std::in_place_index_t + * std::in_place + * std::in_place_type + * std::in_place_index + */ + +struct in_place_tag {}; +template +struct in_place_type_tag {}; +template +struct in_place_index_tag {}; + +using in_place_t = in_place_tag (&)(in_place_tag); +template +using in_place_type_t = in_place_type_tag (&)(in_place_type_tag); +template +using in_place_index_t = in_place_index_tag (&)(in_place_index_tag); + +inline in_place_tag in_place(in_place_tag = {}) { + return {}; +} +template +inline in_place_type_tag in_place_type(in_place_type_tag = {}) { + return {}; +} +template +inline in_place_index_tag in_place_index(in_place_index_tag = {}) { + return {}; +} + +template +T exchange(T& obj, U&& new_value) { + T old_value = std::move(obj); + obj = std::forward(new_value); + return old_value; +} + +namespace utility_detail { +template +struct make_seq_cat; +template < + template class S, + typename T, + T... Ta, + T... Tb, + T... Tc> +struct make_seq_cat, S, S> { + using type = + S; +}; + +// Not parameterizing by `template class, typename` because +// clang precisely v4.0 fails to compile that. Note that clang v3.9 and v5.0 +// handle that code correctly. +// +// For this to work, `S0` is required to be `Sequence` and `S1` is required +// to be `Sequence`. + +template +struct make_seq { + template + using apply = typename make_seq_cat< + typename make_seq::template apply, + typename make_seq::template apply, + typename make_seq::template apply>::type; +}; +template <> +struct make_seq<1> { + template + using apply = S1; +}; +template <> +struct make_seq<0> { + template + using apply = S0; +}; +} // namespace utility_detail + +// TODO: Remove after upgrading to C++14 baseline + +template +struct integer_sequence { + using value_type = T; + + static constexpr std::size_t size() noexcept { + return sizeof...(Ints); + } +}; + +template +using index_sequence = integer_sequence; + +template +using make_integer_sequence = typename utility_detail::make_seq< + Size>::template apply, integer_sequence>; + +template +using make_index_sequence = make_integer_sequence; +template +using index_sequence_for = make_index_sequence; + +/** + * A simple helper for getting a constant reference to an object. + * + * Example: + * + * std::vector v{1,2,3}; + * // The following two lines are equivalent: + * auto a = const_cast&>(v).begin(); + * auto b = folly::as_const(v).begin(); + * + * Like C++17's std::as_const. See http://wg21.link/p0007 + */ +template +T const& as_const(T& t) noexcept { + return t; +} + +template +void as_const(T const&&) = delete; + +} // namespace folly diff --git a/third-party/folly/folly/chrono/Hardware.h b/third-party/folly/folly/chrono/Hardware.h new file mode 100644 index 00000000000..ec7be82e8be --- /dev/null +++ b/third-party/folly/folly/chrono/Hardware.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
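A minimal sketch of two of the backports in Utility.h above, assuming the same include-path convention as the other folly headers: folly::exchange swaps in a new value and hands back the old one, and folly::as_const yields a const reference without spelling out a const_cast.

#include <cassert>
#include <string>
#include <vector>
#include "folly/Utility.h"  // assumed include path

int main() {
  // folly::exchange: set a new value, get the old one back (like std::exchange).
  std::string cur = "old";
  std::string prev = folly::exchange(cur, std::string("new"));
  assert(prev == "old" && cur == "new");

  // folly::as_const: a const view of a mutable object.
  std::vector<int> v{1, 2, 3};
  const std::vector<int>& cv = folly::as_const(v);
  assert(cv.size() == 3);
  return 0;
}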
+ +#pragma once + +#include + +#include +#include + +#if _MSC_VER +extern "C" std::uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif + +namespace folly { + +inline std::uint64_t hardware_timestamp() { +#if _MSC_VER + return __rdtsc(); +#elif __GNUC__ && (__i386__ || FOLLY_X64) + return __builtin_ia32_rdtsc(); +#else + // use steady_clock::now() as an approximation for the timestamp counter on + // non-x86 systems + return std::chrono::steady_clock::now().time_since_epoch().count(); +#endif +} + +} // namespace folly + diff --git a/third-party/folly/folly/container/Array.h b/third-party/folly/folly/container/Array.h new file mode 100644 index 00000000000..bb3167b9793 --- /dev/null +++ b/third-party/folly/folly/container/Array.h @@ -0,0 +1,74 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include +#include + +namespace folly { + +namespace array_detail { +template +struct is_ref_wrapper : std::false_type {}; +template +struct is_ref_wrapper> : std::true_type {}; + +template +using not_ref_wrapper = + folly::Negation::type>>; + +template +struct return_type_helper { + using type = D; +}; +template +struct return_type_helper { + static_assert( + folly::Conjunction...>::value, + "TList cannot contain reference_wrappers when D is void"); + using type = typename std::common_type::type; +}; + +template +using return_type = std:: + array::type, sizeof...(TList)>; +} // namespace array_detail + +template +constexpr array_detail::return_type make_array(TList&&... t) { + using value_type = + typename array_detail::return_type_helper::type; + return {{static_cast(std::forward(t))...}}; +} + +namespace array_detail { +template +inline constexpr auto make_array_with( + MakeItem const& make, + folly::index_sequence) + -> std::array { + return std::array{{make(Index)...}}; +} +} // namespace array_detail + +// make_array_with +// +// Constructs a std::array<..., Size> with elements m(i) for i in [0, Size). +template +constexpr auto make_array_with(MakeItem const& make) + -> decltype(array_detail::make_array_with( + make, + folly::make_index_sequence{})) { + return array_detail::make_array_with( + make, + folly::make_index_sequence{}); +} + +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex-inl.h b/third-party/folly/folly/detail/Futex-inl.h new file mode 100644 index 00000000000..3b2a412bfb6 --- /dev/null +++ b/third-party/folly/folly/detail/Futex-inl.h @@ -0,0 +1,117 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +namespace folly { +namespace detail { + +/** Optimal when TargetClock is the same type as Clock. + * + * Otherwise, both Clock::now() and TargetClock::now() must be invoked. 
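A short sketch of the two helpers in container/Array.h above: make_array deduces the element type through std::common_type unless it is named explicitly, and make_array_with builds the array from a callable applied to each index. The include path is an assumption.

#include <cstddef>
#include <string>
#include "folly/container/Array.h"  // assumed include path

int main() {
  // Element type deduced as double via std::common_type of the arguments.
  auto nums = folly::make_array(1.0, 2, 3.5f);               // std::array<double, 3>

  // Element type named explicitly.
  auto strs = folly::make_array<std::string>("foo", "bar");  // std::array<std::string, 2>

  // make_array_with<N>(f) builds {f(0), f(1), ..., f(N - 1)}.
  auto squares =
      folly::make_array_with<4>([](std::size_t i) { return i * i; });

  (void)nums;
  (void)strs;
  (void)squares;
  return 0;
}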
*/ +template +typename TargetClock::time_point time_point_conv( + std::chrono::time_point const& time) { + using std::chrono::duration_cast; + using TimePoint = std::chrono::time_point; + using TargetDuration = typename TargetClock::duration; + using TargetTimePoint = typename TargetClock::time_point; + if (time == TimePoint::max()) { + return TargetTimePoint::max(); + } else if (std::is_same::value) { + // in place of time_point_cast, which cannot compile without if-constexpr + auto const delta = time.time_since_epoch(); + return TargetTimePoint(duration_cast(delta)); + } else { + // different clocks with different epochs, so non-optimal case + auto const delta = time - Clock::now(); + return TargetClock::now() + duration_cast(delta); + } +} + +/** + * Available overloads, with definitions elsewhere + * + * These functions are treated as ADL-extension points, the templates above + * call these functions without them having being pre-declared. This works + * because ADL lookup finds the definitions of these functions when you pass + * the relevant arguments + */ +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask); +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + std::chrono::system_clock::time_point const* absSystemTime, + std::chrono::steady_clock::time_point const* absSteadyTime, + uint32_t waitMask); + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, nullptr, &deadline, waitMask); +} + +template +typename std::enable_if::type +futexWaitImpl( + Futex* futex, + uint32_t expected, + Deadline const& deadline, + uint32_t waitMask) { + return futexWaitImpl(futex, expected, &deadline, nullptr, waitMask); +} + +template +FutexResult +futexWait(const Futex* futex, uint32_t expected, uint32_t waitMask) { + auto rv = futexWaitImpl(futex, expected, nullptr, nullptr, waitMask); + assert(rv != FutexResult::TIMEDOUT); + return rv; +} + +template +int futexWake(const Futex* futex, int count, uint32_t wakeMask) { + return futexWakeImpl(futex, count, wakeMask); +} + +template +FutexResult futexWaitUntil( + const Futex* futex, + uint32_t expected, + std::chrono::time_point const& deadline, + uint32_t waitMask) { + using Target = typename std::conditional< + Clock::is_steady, + std::chrono::steady_clock, + std::chrono::system_clock>::type; + auto const converted = time_point_conv(deadline); + return converted == Target::time_point::max() + ? futexWaitImpl(futex, expected, nullptr, nullptr, waitMask) + : futexWaitImpl(futex, expected, converted, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.cpp b/third-party/folly/folly/detail/Futex.cpp new file mode 100644 index 00000000000..62d6ea2b201 --- /dev/null +++ b/third-party/folly/folly/detail/Futex.cpp @@ -0,0 +1,263 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
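A hedged sketch of a one-shot event built on the wrappers defined in Futex-inl.h above. It assumes the Futex alias resolves to std::atomic over a 32-bit unsigned integer for the default Atom (as in upstream folly), that an all-ones wait mask matches any wake mask, and that the header is reached as folly/detail/Futex.h; these functions live in the internal folly::detail namespace and are shown only to illustrate the call shape.

#include <atomic>
#include <cstdint>
#include <thread>
#include "folly/detail/Futex.h"  // assumed include path

int main() {
  folly::detail::Futex<std::atomic> ready{0};         // an atomic 32-bit value
  constexpr std::uint32_t kMask = ~std::uint32_t(0);  // match any wake mask

  std::thread waiter([&] {
    // Re-check in a loop: the wait can return early on a spurious wake, a
    // signal, or when the value no longer matches `expected`.
    while (ready.load(std::memory_order_acquire) == 0) {
      folly::detail::futexWait(&ready, /*expected=*/0, kMask);
    }
  });

  ready.store(1, std::memory_order_release);
  folly::detail::futexWake(&ready, /*count=*/1, kMask);
  waiter.join();
  return 0;
}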
+ +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __linux__ +#include +#endif + +#ifndef _WIN32 +#include +#endif + +using namespace std::chrono; + +namespace folly { +namespace detail { + +namespace { + +//////////////////////////////////////////////////// +// native implementation using the futex() syscall + +#ifdef __linux__ + +/// Certain toolchains (like Android's) don't include the full futex API in +/// their headers even though they support it. Make sure we have our constants +/// even if the headers don't have them. +#ifndef FUTEX_WAIT_BITSET +#define FUTEX_WAIT_BITSET 9 +#endif +#ifndef FUTEX_WAKE_BITSET +#define FUTEX_WAKE_BITSET 10 +#endif +#ifndef FUTEX_PRIVATE_FLAG +#define FUTEX_PRIVATE_FLAG 128 +#endif +#ifndef FUTEX_CLOCK_REALTIME +#define FUTEX_CLOCK_REALTIME 256 +#endif + +int nativeFutexWake(const void* addr, int count, uint32_t wakeMask) { + long rv = syscall( + __NR_futex, + addr, /* addr1 */ + FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG, /* op */ + count, /* val */ + nullptr, /* timeout */ + nullptr, /* addr2 */ + wakeMask); /* val3 */ + + /* NOTE: we ignore errors on wake for the case of a futex + guarding its own destruction, similar to this + glibc bug with sem_post/sem_wait: + https://sourceware.org/bugzilla/show_bug.cgi?id=12674 */ + if (rv < 0) { + return 0; + } + return static_cast(rv); +} + +template +struct timespec timeSpecFromTimePoint(time_point absTime) { + auto epoch = absTime.time_since_epoch(); + if (epoch.count() < 0) { + // kernel timespec_valid requires non-negative seconds and nanos in [0,1G) + epoch = Clock::duration::zero(); + } + + // timespec-safe seconds and nanoseconds; + // chrono::{nano,}seconds are `long long int` + // whereas timespec uses smaller types + using time_t_seconds = duration; + using long_nanos = duration; + + auto secs = duration_cast(epoch); + auto nanos = duration_cast(epoch - secs); + struct timespec result = {secs.count(), nanos.count()}; + return result; +} + +FutexResult nativeFutexWaitImpl( + const void* addr, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + assert(absSystemTime == nullptr || absSteadyTime == nullptr); + + int op = FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG; + struct timespec ts; + struct timespec* timeout = nullptr; + + if (absSystemTime != nullptr) { + op |= FUTEX_CLOCK_REALTIME; + ts = timeSpecFromTimePoint(*absSystemTime); + timeout = &ts; + } else if (absSteadyTime != nullptr) { + ts = timeSpecFromTimePoint(*absSteadyTime); + timeout = &ts; + } + + // Unlike FUTEX_WAIT, FUTEX_WAIT_BITSET requires an absolute timeout + // value - http://locklessinc.com/articles/futex_cheat_sheet/ + long rv = syscall( + __NR_futex, + addr, /* addr1 */ + op, /* op */ + expected, /* val */ + timeout, /* timeout */ + nullptr, /* addr2 */ + waitMask); /* val3 */ + + if (rv == 0) { + return FutexResult::AWOKEN; + } else { + switch (errno) { + case ETIMEDOUT: + assert(timeout != nullptr); + return FutexResult::TIMEDOUT; + case EINTR: + return FutexResult::INTERRUPTED; + case EWOULDBLOCK: + return FutexResult::VALUE_CHANGED; + default: + assert(false); + // EINVAL, EACCESS, or EFAULT. EINVAL means there was an invalid + // op (should be impossible) or an invalid timeout (should have + // been sanitized by timeSpecFromTimePoint). EACCESS or EFAULT + // means *addr points to invalid memory, which is unlikely because + // the caller should have segfaulted already. 
We can either + // crash, or return a value that lets the process continue for + // a bit. We choose the latter. VALUE_CHANGED probably turns the + // caller into a spin lock. + return FutexResult::VALUE_CHANGED; + } + } +} + +#endif // __linux__ + +/////////////////////////////////////////////////////// +// compatibility implementation using standard C++ API + +using Lot = ParkingLot; +Lot parkingLot; + +int emulatedFutexWake(const void* addr, int count, uint32_t waitMask) { + int woken = 0; + parkingLot.unpark(addr, [&](const uint32_t& mask) { + if ((mask & waitMask) == 0) { + return UnparkControl::RetainContinue; + } + assert(count > 0); + count--; + woken++; + return count > 0 ? UnparkControl::RemoveContinue + : UnparkControl::RemoveBreak; + }); + return woken; +} + +template +FutexResult emulatedFutexWaitImpl( + F* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + static_assert( + std::is_same>::value || + std::is_same>::value, + "Type F must be either Futex or Futex"); + ParkResult res; + if (absSystemTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSystemTime); + } else if (absSteadyTime) { + res = parkingLot.park_until( + futex, + waitMask, + [&] { return *futex == expected; }, + [] {}, + *absSteadyTime); + } else { + res = parkingLot.park( + futex, waitMask, [&] { return *futex == expected; }, [] {}); + } + switch (res) { + case ParkResult::Skip: + return FutexResult::VALUE_CHANGED; + case ParkResult::Unpark: + return FutexResult::AWOKEN; + case ParkResult::Timeout: + return FutexResult::TIMEDOUT; + } + + return FutexResult::INTERRUPTED; +} + +} // namespace + +///////////////////////////////// +// Futex<> overloads + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { +#ifdef __linux__ + return nativeFutexWake(futex, count, wakeMask); +#else + return emulatedFutexWake(futex, count, wakeMask); +#endif +} + +int futexWakeImpl( + const Futex* futex, + int count, + uint32_t wakeMask) { + return emulatedFutexWake(futex, count, wakeMask); +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { +#ifdef __linux__ + return nativeFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#else + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +#endif +} + +FutexResult futexWaitImpl( + const Futex* futex, + uint32_t expected, + system_clock::time_point const* absSystemTime, + steady_clock::time_point const* absSteadyTime, + uint32_t waitMask) { + return emulatedFutexWaitImpl( + futex, expected, absSystemTime, absSteadyTime, waitMask); +} + +} // namespace detail +} // namespace folly diff --git a/third-party/folly/folly/detail/Futex.h b/third-party/folly/folly/detail/Futex.h new file mode 100644 index 00000000000..987a1b89574 --- /dev/null +++ b/third-party/folly/folly/detail/Futex.h @@ -0,0 +1,96 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
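One more hedged sketch, this time of a timed wait with futexWaitUntil from Futex-inl.h above, handling the FutexResult values declared just below. A steady_clock deadline is forwarded to the steady-clock overload via time_point_conv; the include path and mask value are assumptions.

#include <atomic>
#include <chrono>
#include <cstdint>
#include "folly/detail/Futex.h"  // assumed include path

// Returns true if `flag` became non-zero before the deadline expired.
bool WaitBriefly(folly::detail::Futex<std::atomic>& flag) {
  constexpr std::uint32_t kMask = ~std::uint32_t(0);
  auto deadline =
      std::chrono::steady_clock::now() + std::chrono::milliseconds(10);
  while (flag.load(std::memory_order_acquire) == 0) {
    auto res =
        folly::detail::futexWaitUntil(&flag, /*expected=*/0, deadline, kMask);
    if (res == folly::detail::FutexResult::TIMEDOUT) {
      return false;
    }
    // AWOKEN, INTERRUPTED, or VALUE_CHANGED: re-check the value, maybe wait again.
  }
  return true;
}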
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace folly { +namespace detail { + +enum class FutexResult { + VALUE_CHANGED, /* futex value didn't match expected */ + AWOKEN, /* wakeup by matching futex wake, or spurious wakeup */ + INTERRUPTED, /* wakeup by interrupting signal */ + TIMEDOUT, /* wakeup by expiring deadline */ +}; + +/** + * Futex is an atomic 32 bit unsigned integer that provides access to the + * futex() syscall on that value. It is templated in such a way that it + * can interact properly with DeterministicSchedule testing. + * + * If you don't know how to use futex(), you probably shouldn't be using + * this class. Even if you do know how, you should have a good reason + * (and benchmarks to back you up). + * + * Because of the semantics of the futex syscall, the futex family of + * functions are available as free functions rather than member functions + */ +template